This commit is contained in:
Ilyas Moutawwakil
2025-06-23 10:56:51 +02:00
committed by GitHub
parent 2166b6b4ff
commit 984ff89e73
16 changed files with 618 additions and 14 deletions

View File

@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# the test slower.
@require_torch_non_multi_accelerator
@run_test_using_subprocess
@run_first
@slow
def test_can_resume_training_lm(self):
# Check if it works for a simple language modeling example
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
)
@slow
@run_first
def test_trainer_eval_mrpc(self):
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertLess(result["eval_loss"], 0.2)
@slow
@run_first
def test_trainer_eval_multiple(self):
MODEL_ID = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
@slow
@run_first
@require_non_hpu
@require_torch_multi_accelerator
def test_end_to_end_example(self):

View File

@@ -22,6 +22,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
from transformers.training_args import ParallelMode
@@ -116,6 +117,7 @@ if is_torch_available():
class TestTrainerDistributed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@@ -199,8 +201,7 @@ if __name__ == "__main__":
model = RegressionModel()
training_args.per_device_train_batch_size = 1
training_args.max_steps = 1
training_args.accelerator_config = {
"dispatch_batches": False,
}
training_args.accelerator_config.dispatch_batches = False
trainer = Trainer(model, training_args, train_dataset=train_dataset)
trainer.train()

View File

@@ -18,11 +18,13 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
class TestTrainerDistributedLoss(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)

View File

@@ -18,6 +18,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_accelerator,
run_first,
torch_device,
)
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
class TestTrainerDistributedWorkerSeed(TestCasePlus):
@run_first
@require_torch_multi_accelerator
def test_trainer(self):
device_count = backend_device_count(torch_device)