Fix many HPU failures in the CI (#39066)

* more torch.hpu patches

* increase top_k because it results in flaky behavior when Temperature, TopP and TopK are used together, which ends up killing beams early.

* remove temporary fix

* fix scatter operation when input and src are the same

* trigger

* fix and reduce

* skip finding batch size as it makes the hpu go loco

* fix fsdp (yay all are passing)

* fix checking equal nan values

* style

* remove models list

* order

* rename to cuda_extensions

* Update src/transformers/trainer.py
This commit is contained in:
Ilyas Moutawwakil
2025-07-03 11:17:27 +02:00
committed by GitHub
parent bff964c429
commit 18e0cae207
5 changed files with 71 additions and 54 deletions

View File

@@ -662,6 +662,11 @@ class TrainerIntegrationCommon:
metrics = trainer.evaluate()
self.assertEqual(metrics[metric], best_value)
def remove_nan_logs(self, log):
    """Delete every entry of ``log`` whose value is NaN, in place.

    NaN never compares equal to itself, so two otherwise identical log
    dicts that both contain a NaN under the same key would still compare
    as different. Dropping those entries first lets the remaining entries
    be compared with plain equality.

    Args:
        log: Mutable mapping of log entries. It is modified in place.

    Returns:
        None.
    """
    # A value that differs from itself is NaN. This self-comparison is used
    # instead of ``math.isnan`` so that non-numeric values (e.g. strings)
    # are simply kept rather than raising ``TypeError``.
    # Collect the keys first so the dict is not mutated while iterating it.
    nan_keys = [key for key, value in log.items() if value != value]
    for key in nan_keys:
        del log[key]
def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
# We'll pop things so operate on copies.
state = trainer_state.copy()
@@ -675,6 +680,10 @@ class TrainerIntegrationCommon:
for key in skip_log_keys:
_ = log.pop(key, None)
_ = log1.pop(key, None)
self.remove_nan_logs(log)
self.remove_nan_logs(log1)
self.assertEqual(log, log1)
def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True):
@@ -3174,6 +3183,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(b, b1, delta=1e-5)
@slow
@require_non_hpu
@require_accelerate
@require_torch_non_multi_accelerator
def test_auto_batch_size_finder(self):