HPU support (#36424)

* test * fix * fix * skip some and run some first * test fsdp * fix * patches for generate * test distributed * copy * don't test distributed loss for hpu * require fp16 and run first * changes from marc's PR fixing zero3 * better alternative * return True when fp16 support on gaudi without creating bridge * fix * fix tested dtype in deepspeed inference test * test * fix * test * fix * skip * require fp16 * run first fsdp * Apply suggestions from code review * address comments * address comments and refactor test * reduce precision * avoid doing gaudi1 specific stuff in the generation loop * document test_gradient_accumulation_loss_alignment_with_model_loss test a bit more
2025-03-12 09:08:12 +01:00
parent 50d3530aa0
commit 89f6956015
19 changed files with 337 additions and 139 deletions
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -75,6 +75,7 @@ from transformers.testing_utils import (
    require_intel_extension_for_pytorch,
    require_liger_kernel,
    require_lomo,
+    require_non_hpu,
    require_non_xpu,
    require_optuna,
    require_peft,
@@ -88,6 +89,7 @@ from transformers.testing_utils import (
    require_torch,
    require_torch_accelerator,
    require_torch_bf16,
+    require_torch_fp16,
    require_torch_gpu,
    require_torch_multi_accelerator,
    require_torch_non_multi_accelerator,
@@ -98,6 +100,7 @@ from transformers.testing_utils import (
    require_torchdynamo,
    require_vision,
    require_wandb,
+    run_first,
    run_test_using_subprocess,
    slow,
    torch_device,
@@ -119,6 +122,13 @@ from transformers.utils import (
 from transformers.utils.hp_naming import TrialShortNamer


+if torch_device == "hpu":
+    RTOL = 1e-3
+    ATOL = 1e-3
+else:
+    RTOL = 1e-5
+    ATOL = 1e-5
+
 if is_torch_available():
    import torch
    from torch import nn
@@ -726,11 +736,11 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
            trainer.train()
            self.alternate_trained_model = (trainer.model.a, trainer.model.b)

-    def check_trained_model(self, model, alternate_seed=False):
+    def check_trained_model(self, model, alternate_seed=False, **kwargs):
        # Checks a training seeded with learning_rate = 0.1
        (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
-        torch.testing.assert_close(model.a, a)
-        torch.testing.assert_close(model.b, b)
+        torch.testing.assert_close(model.a, a, **kwargs)
+        torch.testing.assert_close(model.b, b, **kwargs)

    def test_reproducible_training(self):
        # Checks that training worked, model trained and seed made a reproducible training.
@@ -812,11 +822,6 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

-        model = AutoModelForCausalLM.from_pretrained(model_name)
-        state_dict = model.state_dict()
-
-        base_loss_callback = StoreLossCallback()
-
        args_kwargs = {
            "report_to": "none",
            "logging_steps": 1,
@@ -830,6 +835,10 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
                tmp_dir,
                **args_kwargs,
            )
+            # train with base loss
+            set_seed(42)
+            model = AutoModelForCausalLM.from_pretrained(model_name)
+            base_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
                args,
@@ -840,16 +849,17 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
            assert trainer.model_accepts_loss_kwargs
            trainer.train()

-        grad_accum_loss_callback = StoreLossCallback()
-        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                **args_kwargs,
                gradient_accumulation_steps=2,
                per_device_train_batch_size=4,
            )
+
+            # train with gradient accumulation
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
+            grad_accum_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
                args,
@@ -857,10 +867,12 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
                callbacks=[grad_accum_loss_callback],
                data_collator=data_collator,
            )
+            assert trainer.model_accepts_loss_kwargs
            trainer.train()

+            # train with broken loss
            set_seed(42)
-            model.load_state_dict(state_dict)
+            model = AutoModelForCausalLM.from_pretrained(model_name)
            broken_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
@@ -869,30 +881,28 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
                callbacks=[broken_loss_callback],
                data_collator=data_collator,
            )
-            # disable model_accepts_loss_kwargs
+            # disable model_accepts_loss_kwargs so that "num_items_in_batch" is not passed to the model
            trainer.model_accepts_loss_kwargs = False
            trainer.train()

-            # Calculate the difference between the base loss and the grad_accum loss
-            diff_truth = [
-                abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
-            ]
-            diff_broken = [
-                abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)
-            ]
+        # Calculate the difference between the base loss and the grad_accum loss
+        diff_truth = [
+            abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
+        ]
+        diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)]

-            # all diff truth should be quite close
-            self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
+        # all diff truth should be quite close
+        self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")

-            # max diff broken should be very off
-            self.assertGreater(max(diff_broken), 1.5, f"Difference {max(diff_broken)} is not greater than 2")
+        # max diff broken should be very off
+        self.assertGreater(max(diff_broken), 1.3, f"Difference {max(diff_broken)} is not greater than 1.3")

-            loss_base = sum(base_loss_callback.losses)
-            loss_broken = sum(broken_loss_callback.losses)
+        loss_base = sum(base_loss_callback.losses)
+        loss_broken = sum(broken_loss_callback.losses)

-            # mean/sum loss should not vary too much.
-            relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
-            self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
+        # mean/sum loss should not vary too much.
+        relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
+        self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")

    def test_gradient_accumulation_loss_alignment_with_loss_func(self):
        set_seed(42)
@@ -1214,14 +1224,14 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertFalse(torch.allclose(trainer.model.b, b))
            self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)

-    @require_torch_accelerator
    @require_torch_bf16
+    @require_torch_accelerator
    def test_mixed_bf16(self):
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
            trainer.train()
-            self.check_trained_model(trainer.model)
+            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)

        # --bf16 --half_precision_backend apex can't be used together
        with tempfile.TemporaryDirectory() as tmp_dir:
@@ -3582,6 +3592,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                )

    @slow
+    @run_first
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3598,6 +3609,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertLess(result["eval_loss"], 0.2)

    @slow
+    @run_first
    def test_trainer_eval_multiple(self):
        MODEL_ID = "openai-community/gpt2"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3897,6 +3909,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir)
            self.check_mem_metrics(trainer, self.assertNotIn)

+    @require_torch_fp16
    @require_torch_accelerator
    def test_fp16_full_eval(self):
        # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
@@ -4152,6 +4165,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)

    @slow
+    @require_non_hpu
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
        # Tests that `translation.py` will run without issues
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -19,12 +19,11 @@ import numpy as np
 from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available
 from transformers.testing_utils import (
    TestCasePlus,
+    backend_device_count,
    execute_subprocess_async,
    get_torch_dist_unique_port,
-    require_torch_multi_gpu,
-    require_torch_multi_xpu,
-    require_torch_neuroncore,
-    require_torch_npu,
+    require_torch_multi_accelerator,
+    torch_device,
 )
 from transformers.training_args import ParallelMode
 from transformers.utils import logging
@@ -117,38 +116,10 @@ if is_torch_available():
            return result


-class TestTrainerDistributedNeuronCore(TestCasePlus):
-    @require_torch_neuroncore
-    def test_trainer(self):
-        distributed_args = f"""--nproc_per_node=2
-            --master_port={get_torch_dist_unique_port()}
-            {self.test_file_dir}/test_trainer_distributed.py
-        """.split()
-        output_dir = self.get_auto_remove_tmp_dir()
-        args = f"--output_dir {output_dir}".split()
-        cmd = ["torchrun"] + distributed_args + args
-        execute_subprocess_async(cmd, env=self.get_env())
-        # successful return here == success - any errors would have caused an error in the sub-call
-
-
-class TestTrainerDistributedNPU(TestCasePlus):
-    @require_torch_npu
-    def test_trainer(self):
-        distributed_args = f"""--nproc_per_node=2
-            --master_port={get_torch_dist_unique_port()}
-            {self.test_file_dir}/test_trainer_distributed.py
-        """.split()
-        output_dir = self.get_auto_remove_tmp_dir()
-        args = f"--output_dir {output_dir}".split()
-        cmd = ["torchrun"] + distributed_args + args
-        execute_subprocess_async(cmd, env=self.get_env())
-        # successful return here == success - any errors would have caused an error in the sub-call
-
-
 class TestTrainerDistributed(TestCasePlus):
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
    def test_trainer(self):
-        distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
+        distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
            --master_port={get_torch_dist_unique_port()}
            {self.test_file_dir}/test_trainer_distributed.py
        """.split()
@@ -159,20 +130,6 @@ class TestTrainerDistributed(TestCasePlus):
        # successful return here == success - any errors would have caused an error in the sub-call


-@require_torch_multi_xpu
-class TestTrainerDistributedXPU(TestCasePlus):
-    def test_trainer(self):
-        distributed_args = f"""--nproc_per_node={torch.xpu.device_count()}
-            --master_port={get_torch_dist_unique_port()}
-            {self.test_file_dir}/test_trainer_distributed.py
-        """.split()
-        output_dir = self.get_auto_remove_tmp_dir()
-        args = f"--output_dir {output_dir}".split()
-        cmd = ["torchrun"] + distributed_args + args
-        execute_subprocess_async(cmd, env=self.get_env())
-        # successful return here == success - any errors would have caused an error in the sub-call
-
-
 if __name__ == "__main__":
    # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
    #
--- a/tests/trainer/test_trainer_fsdp.py
+++ b/tests/trainer/test_trainer_fsdp.py
@@ -17,12 +17,15 @@ from typing import Dict
 from transformers import is_torch_available
 from transformers.testing_utils import (
    TestCasePlus,
+    backend_device_count,
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_accelerate,
    require_fp8,
    require_fsdp,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
+    run_first,
+    torch_device,
 )


@@ -64,9 +67,10 @@ if is_torch_available():


 class TestFSDPTrainer(TestCasePlus):
+    @require_torch_multi_accelerator
    @require_accelerate
-    @require_torch_multi_gpu
    @require_fsdp
+    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
@@ -76,7 +80,7 @@ class TestFSDPTrainer(TestCasePlus):
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
-            f"{torch.cuda.device_count()}",
+            f"{backend_device_count(torch_device)}",
            "--fsdp_transformer_layer_cls_to_wrap",
            "GPT2Block",
            f"{self.test_file_dir}/test_trainer_fsdp.py",
@@ -90,10 +94,11 @@ class TestFSDPTrainer(TestCasePlus):


 class TestFSDPTrainerFP8(TestCasePlus):
+    @require_torch_multi_accelerator
    @require_accelerate
-    @require_torch_multi_gpu
    @require_fsdp
    @require_fp8
+    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
@@ -103,7 +108,7 @@ class TestFSDPTrainerFP8(TestCasePlus):
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
-            f"{torch.cuda.device_count()}",
+            f"{backend_device_count(torch_device)}",
            "--mixed_precision",
            "fp8",
            "--fsdp_transformer_layer_cls_to_wrap",
@@ -117,32 +122,34 @@ class TestFSDPTrainerFP8(TestCasePlus):
        execute_subprocess_async(cmd, env=self.get_env())
        # successful return here == success - any errors would have caused an error in the sub-call

-    class TestFSDPTrainerWrap(TestCasePlus):
-        @require_accelerate
-        @require_torch_multi_gpu
-        @require_fsdp
-        def test_trainer(self):
-            output_dir = self.get_auto_remove_tmp_dir()
-            cmd = [
-                "accelerate",
-                "launch",
-                "--use_fsdp",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
-                "--num_processes",
-                f"{torch.cuda.device_count()}",
-                "--fsdp_transformer_layer_cls_to_wrap",
-                "GPT2Block",
-                f"{self.test_file_dir}/test_trainer_fsdp.py",
-                "--output_dir",
-                f"{output_dir}",
-                "--report_to",
-                "none",
-                "--auto_find_batch_size",
-                "True",
-            ]
-            execute_subprocess_async(cmd, env=self.get_env())
-            # successful return here == success - any errors would have caused an error in the sub-call
+
+class TestFSDPTrainerWrap(TestCasePlus):
+    @require_torch_multi_accelerator
+    @require_accelerate
+    @require_fsdp
+    @run_first
+    def test_trainer(self):
+        output_dir = self.get_auto_remove_tmp_dir()
+        cmd = [
+            "accelerate",
+            "launch",
+            "--use_fsdp",
+            "--main_process_port",
+            f"{get_torch_dist_unique_port()}",
+            "--num_processes",
+            f"{backend_device_count(torch_device)}",
+            "--fsdp_transformer_layer_cls_to_wrap",
+            "GPT2Block",
+            f"{self.test_file_dir}/test_trainer_fsdp.py",
+            "--output_dir",
+            f"{output_dir}",
+            "--report_to",
+            "none",
+            "--auto_find_batch_size",
+            "True",
+        ]
+        execute_subprocess_async(cmd, env=self.get_env())
+        # successful return here == success - any errors would have caused an error in the sub-call


 if __name__ == "__main__":