Fix many HPU failures in the CI (#39066)

* more torch.hpu patches * increase top_k because it results in flaky behavior when Tempreture, TopP and TopK are used together, which ends up killing beams early. * remove temporal fix * fix scatter operation when input and src are the same * trigger * fix and reduce * skip finding batch size as it makes the hpu go loco * fix fsdp (yay all are passing) * fix checking equal nan values * style * remove models list * order * rename to cuda_extensions * Update src/transformers/trainer.py
2025-07-03 11:17:27 +02:00
parent bff964c429
commit 18e0cae207
5 changed files with 71 additions and 54 deletions
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -662,6 +662,11 @@ class TrainerIntegrationCommon:
        metrics = trainer.evaluate()
        self.assertEqual(metrics[metric], best_value)

+    def remove_nan_logs(self, log):
+        for key in list(log.keys()):
+            if log[key] != log[key]:  # Check if the value is NaN
+                del log[key]
+
    def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
        # We'll pop things so operate on copies.
        state = trainer_state.copy()
@@ -675,6 +680,10 @@ class TrainerIntegrationCommon:
            for key in skip_log_keys:
                _ = log.pop(key, None)
                _ = log1.pop(key, None)
+
+            self.remove_nan_logs(log)
+            self.remove_nan_logs(log1)
+
            self.assertEqual(log, log1)

    def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True):
@@ -3174,6 +3183,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertAlmostEqual(b, b1, delta=1e-5)

    @slow
+    @require_non_hpu
    @require_accelerate
    @require_torch_non_multi_accelerator
    def test_auto_batch_size_finder(self):