Gaudi3 CI (#38790)
This commit is contained in:
Committed by: GitHub
Parent: 2166b6b4ff
Commit: 984ff89e73
@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
# the test slower.
|
||||
@require_torch_non_multi_accelerator
|
||||
@run_test_using_subprocess
|
||||
@run_first
|
||||
@slow
|
||||
def test_can_resume_training_lm(self):
|
||||
# Check if it works for a simple language modeling example
|
||||
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_mrpc(self):
|
||||
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertLess(result["eval_loss"], 0.2)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_multiple(self):
|
||||
MODEL_ID = "openai-community/gpt2"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_non_hpu
|
||||
@require_torch_multi_accelerator
|
||||
def test_end_to_end_example(self):
|
||||
|
||||
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.training_args import ParallelMode
|
||||
@@ -116,6 +117,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
class TestTrainerDistributed(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
|
||||
@@ -199,8 +201,7 @@ if __name__ == "__main__":
|
||||
model = RegressionModel()
|
||||
training_args.per_device_train_batch_size = 1
|
||||
training_args.max_steps = 1
|
||||
training_args.accelerator_config = {
|
||||
"dispatch_batches": False,
|
||||
}
|
||||
training_args.accelerator_config.dispatch_batches = False
|
||||
|
||||
trainer = Trainer(model, training_args, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
|
||||
@@ -18,11 +18,13 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
class TestTrainerDistributedLoss(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
device_count = backend_device_count(torch_device)
|
||||
|
||||
@@ -18,6 +18,7 @@ from transformers.testing_utils import (
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
|
||||
|
||||
|
||||
class TestTrainerDistributedWorkerSeed(TestCasePlus):
|
||||
@run_first
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
device_count = backend_device_count(torch_device)
|
||||
|
||||
Reference in New Issue
Block a user