Gaudi3 CI (#38790)

2025-06-23 10:56:51 +02:00
parent 2166b6b4ff
commit 984ff89e73
16 changed files with 618 additions and 14 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @require_torch_multi_accelerator
+    @run_first
    def test_basic_distributed(self, stage, dtype):
        self.run_and_check(stage=stage, dtype=dtype, distributed=True)

    @require_torch_fp16
+    @run_first
    def test_do_eval_no_train(self):
        # testing only zero3 since zero2 makes no sense with inference
        self.run_and_check(
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        )

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
    def test_fp32_non_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @require_torch_multi_accelerator
+    @run_first
    def test_fp32_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        )

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
    def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
        # do normal training and then resume not from the deepspeed checkpoint but explicitly from
        # the saved model dir
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):

    @parameterized.expand(["bf16", "fp16", "fp32"])
    @require_torch_multi_accelerator
+    @run_first
    def test_inference(self, dtype):
        if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
            self.skipTest(reason="test requires bfloat16 hardware support")
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        return output_dir

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
    def test_clm(self, stage, dtype):
        # this test exercises model.resize_token_embeddings() which requires param gathering outside
        # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        execute_subprocess_async(cmd, env=self.get_env())

    @require_torch_fp16
+    @run_first
    def test_clm_from_config_zero3_fp16(self):
        # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called

--- a/tests/deepspeed/test_model_zoo.py
+++ b/tests/deepspeed/test_model_zoo.py
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
    get_tests_dir,
    require_deepspeed,
    require_torch_accelerator,
+    run_first,
    slow,
    torch_device,
 )
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))


@slow
+@run_first
@require_deepspeed
@require_torch_accelerator
 class TestDeepSpeedModelZoo(TestCasePlus):
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
            raise AssertionError("CPU offloading failed with FSDP!")

    @require_torch_multi_accelerator
+    @run_first
    @slow
    @require_fsdp_v2_version
    @require_accelerate_fsdp2
@@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
                self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)

    @require_torch_multi_accelerator
+    @run_first
    @slow
    @require_fsdp
    @require_fsdp_v2_version
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -84,6 +84,7 @@ from transformers.testing_utils import (
    require_bitsandbytes,
    require_deepspeed,
    require_flash_attn,
+    require_non_hpu,
    require_safetensors,
    require_torch,
    require_torch_accelerator,
@@ -92,6 +93,7 @@ from transformers.testing_utils import (
    require_torch_multi_accelerator,
    require_torch_multi_gpu,
    require_torch_sdpa,
+    run_first,
    run_test_using_subprocess,
    set_config_for_less_flaky_test,
    set_model_for_less_flaky_test,
@@ -2797,6 +2799,7 @@ class ModelTesterMixin:
                    else:
                        torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)

+    @require_non_hpu
    @require_accelerate
    @mark.accelerate_tests
    @require_torch_multi_accelerator
@@ -3727,6 +3730,9 @@ class ModelTesterMixin:
                if torch_device in ["cpu", "cuda"]:
                    atol = atols[torch_device, enable_kernels, torch_dtype]
                    rtol = rtols[torch_device, enable_kernels, torch_dtype]
+                elif torch_device == "hpu":
+                    atol = atols["cuda", enable_kernels, torch_dtype]
+                    rtol = rtols["cuda", enable_kernels, torch_dtype]
                elif torch_device == "xpu":
                    # As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
                    # which is implemented on PyTorch level using aten operators and is
@@ -4666,6 +4672,7 @@ class ModelTesterMixin:

    # Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
    # may bring unwanted consequences on other tests. See PR #37553
+    @run_first
    @run_test_using_subprocess
    @require_torch_accelerator
    def test_can_load_with_global_device_set(self):
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    # the test slower.
    @require_torch_non_multi_accelerator
    @run_test_using_subprocess
+    @run_first
    @slow
    def test_can_resume_training_lm(self):
        # Check if it works for a simple language modeling example
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                )

    @slow
-    @run_first
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertLess(result["eval_loss"], 0.2)

    @slow
-    @run_first
    def test_trainer_eval_multiple(self):
        MODEL_ID = "openai-community/gpt2"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)

    @slow
+    @run_first
    @require_non_hpu
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
+    run_first,
    torch_device,
 )
 from transformers.training_args import ParallelMode
@@ -116,6 +117,7 @@ if is_torch_available():


 class TestTrainerDistributed(TestCasePlus):
+    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@@ -199,8 +201,7 @@ if __name__ == "__main__":
    model = RegressionModel()
    training_args.per_device_train_batch_size = 1
    training_args.max_steps = 1
-    training_args.accelerator_config = {
-        "dispatch_batches": False,
-    }
+    training_args.accelerator_config.dispatch_batches = False
+
    trainer = Trainer(model, training_args, train_dataset=train_dataset)
    trainer.train()
--- a/tests/trainer/test_trainer_distributed_loss.py
+++ b/tests/trainer/test_trainer_distributed_loss.py
@@ -18,11 +18,13 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
+    run_first,
    torch_device,
 )


 class TestTrainerDistributedLoss(TestCasePlus):
+    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        device_count = backend_device_count(torch_device)
--- a/tests/trainer/test_trainer_distributed_worker_seed.py
+++ b/tests/trainer/test_trainer_distributed_worker_seed.py
@@ -18,6 +18,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
+    run_first,
    torch_device,
 )

@@ -57,6 +58,7 @@ class DummyModel(nn.Module):


 class TestTrainerDistributedWorkerSeed(TestCasePlus):
+    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        device_count = backend_device_count(torch_device)
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -58,6 +58,7 @@ from transformers.testing_utils import (
    is_staging_test,
    require_accelerate,
    require_flax,
+    require_non_hpu,
    require_read_token,
    require_safetensors,
    require_tf,
@@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):

        self.assertIsNotNone(model)

+    @require_non_hpu
    @require_accelerate
    @mark.accelerate_tests
    @require_torch_multi_accelerator