Use inherited tempdir makers for tests + fix failing DS tests (#35600)
* Use existing APIs to make tempdir folders * Fixup deepspeed too * output_dir -> tmp_dir
This commit is contained in:
@@ -482,6 +482,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
max_grad_norm=max_grad_norm,
|
max_grad_norm=max_grad_norm,
|
||||||
adam_beta1=adam_beta1,
|
adam_beta1=adam_beta1,
|
||||||
adam_beta2=adam_beta2,
|
adam_beta2=adam_beta2,
|
||||||
|
output_dir=self.get_auto_remove_tmp_dir(),
|
||||||
)
|
)
|
||||||
with self.assertRaises(Exception) as context:
|
with self.assertRaises(Exception) as context:
|
||||||
trainer.train()
|
trainer.train()
|
||||||
@@ -506,7 +507,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||||
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
trainer = get_regression_trainer(
|
||||||
|
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
|
||||||
|
)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
new_a = trainer.model.a.item()
|
new_a = trainer.model.a.item()
|
||||||
self.assertNotEqual(new_a, a)
|
self.assertNotEqual(new_a, a)
|
||||||
@@ -518,7 +521,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||||
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
trainer = get_regression_trainer(
|
||||||
|
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
|
||||||
|
)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
new_a = trainer.model.a.item()
|
new_a = trainer.model.a.item()
|
||||||
self.assertNotEqual(new_a, a)
|
self.assertNotEqual(new_a, a)
|
||||||
@@ -530,7 +535,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||||
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
trainer = get_regression_trainer(
|
||||||
|
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
|
||||||
|
)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
new_a = trainer.model.a.item()
|
new_a = trainer.model.a.item()
|
||||||
self.assertNotEqual(new_a, a)
|
self.assertNotEqual(new_a, a)
|
||||||
@@ -546,7 +553,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
|
ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
|
||||||
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
|
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
|
||||||
ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
|
ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
|
||||||
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
|
trainer = get_regression_trainer(
|
||||||
|
local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
|
||||||
|
)
|
||||||
with CaptureLogger(deepspeed_logger) as cl:
|
with CaptureLogger(deepspeed_logger) as cl:
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
|
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
|
||||||
@@ -567,6 +576,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
fp16=True,
|
fp16=True,
|
||||||
model_init=model_init,
|
model_init=model_init,
|
||||||
deepspeed=ds_config_zero3_dict,
|
deepspeed=ds_config_zero3_dict,
|
||||||
|
output_dir=self.get_auto_remove_tmp_dir(),
|
||||||
)
|
)
|
||||||
|
|
||||||
n_trials = 3
|
n_trials = 3
|
||||||
@@ -588,7 +598,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
||||||
ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam
|
ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
with mockenv_context(**self.dist_env_1_gpu):
|
||||||
kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
|
kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
|
||||||
kwargs[dtype] = True
|
kwargs[dtype] = True
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
with CaptureLogger(deepspeed_logger) as cl:
|
with CaptureLogger(deepspeed_logger) as cl:
|
||||||
@@ -604,7 +614,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
|
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
|
||||||
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
|
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
with mockenv_context(**self.dist_env_1_gpu):
|
||||||
kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
|
kwargs = {
|
||||||
|
"local_rank": 0,
|
||||||
|
"deepspeed": self.get_config_dict(stage),
|
||||||
|
"output_dir": self.get_auto_remove_tmp_dir(),
|
||||||
|
}
|
||||||
kwargs[dtype] = True
|
kwargs[dtype] = True
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
|
|
||||||
@@ -630,6 +644,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
"deepspeed": self.get_config_dict(stage),
|
"deepspeed": self.get_config_dict(stage),
|
||||||
"per_device_train_batch_size": 8,
|
"per_device_train_batch_size": 8,
|
||||||
"logging_steps": 1,
|
"logging_steps": 1,
|
||||||
|
"output_dir": self.get_auto_remove_tmp_dir(),
|
||||||
}
|
}
|
||||||
kwargs[dtype] = True
|
kwargs[dtype] = True
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
@@ -673,6 +688,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
"local_rank": 0,
|
"local_rank": 0,
|
||||||
"train_len": train_len,
|
"train_len": train_len,
|
||||||
"deepspeed": self.get_config_dict(stage),
|
"deepspeed": self.get_config_dict(stage),
|
||||||
|
"output_dir": self.get_auto_remove_tmp_dir(),
|
||||||
}
|
}
|
||||||
kwargs[dtype] = True
|
kwargs[dtype] = True
|
||||||
|
|
||||||
|
|||||||
@@ -1222,8 +1222,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_dataset = RegressionDataset()
|
train_dataset = RegressionDataset()
|
||||||
eval_dataset = RegressionDataset()
|
eval_dataset = RegressionDataset()
|
||||||
model = RegressionDictModel()
|
model = RegressionDictModel()
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
|
||||||
args = TrainingArguments(tmp_dir, report_to="none")
|
|
||||||
trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
|
trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
_ = trainer.evaluate()
|
_ = trainer.evaluate()
|
||||||
@@ -1234,8 +1233,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
eval_dataset = RepeatDataset(x)
|
eval_dataset = RepeatDataset(x)
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
|
||||||
args = TrainingArguments(tmp_dir, report_to="none")
|
|
||||||
trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
|
trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
|
||||||
# By default the past_key_values are removed
|
# By default the past_key_values are removed
|
||||||
result = trainer.predict(eval_dataset)
|
result = trainer.predict(eval_dataset)
|
||||||
@@ -1246,7 +1244,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertEqual(len(result.predictions), 2)
|
self.assertEqual(len(result.predictions), 2)
|
||||||
|
|
||||||
def test_training_arguments_are_left_untouched(self):
|
def test_training_arguments_are_left_untouched(self):
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmp_dir)
|
trainer = get_regression_trainer(output_dir=tmp_dir)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
args = TrainingArguments(tmp_dir, report_to=[])
|
args = TrainingArguments(tmp_dir, report_to=[])
|
||||||
@@ -1258,7 +1256,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
|
|
||||||
def test_number_of_steps_in_training(self):
|
def test_number_of_steps_in_training(self):
|
||||||
# Regular training has n_epochs * len(train_dl) steps
|
# Regular training has n_epochs * len(train_dl) steps
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
|
trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
|
||||||
train_output = trainer.train()
|
train_output = trainer.train()
|
||||||
self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
|
self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
|
||||||
@@ -1277,7 +1275,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
@require_intel_extension_for_pytorch
|
@require_intel_extension_for_pytorch
|
||||||
def test_number_of_steps_in_training_with_ipex(self):
|
def test_number_of_steps_in_training_with_ipex(self):
|
||||||
for mix_bf16 in [True, False]:
|
for mix_bf16 in [True, False]:
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
# Regular training has n_epochs * len(train_dl) steps
|
# Regular training has n_epochs * len(train_dl) steps
|
||||||
trainer = get_regression_trainer(
|
trainer = get_regression_trainer(
|
||||||
learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
|
learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
|
||||||
@@ -1311,9 +1309,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
per_device_train_batch_size=2,
|
per_device_train_batch_size=2,
|
||||||
torch_compile=True,
|
torch_compile=True,
|
||||||
max_steps=1, # compile happens on the first step
|
max_steps=1, # compile happens on the first step
|
||||||
@@ -1348,9 +1345,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
)
|
)
|
||||||
@@ -1387,9 +1383,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
tmp_dir,
|
||||||
per_device_train_batch_size=1,
|
per_device_train_batch_size=1,
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
save_steps=5,
|
save_steps=5,
|
||||||
@@ -1406,7 +1402,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# Reinitialize trainer
|
# Reinitialize trainer
|
||||||
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
|
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||||
|
|
||||||
checkpoint = os.path.join(tmpdir, "checkpoint-5")
|
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
|
||||||
|
|
||||||
trainer.train(resume_from_checkpoint=checkpoint)
|
trainer.train(resume_from_checkpoint=checkpoint)
|
||||||
parameters1 = dict(tiny_model.named_parameters())
|
parameters1 = dict(tiny_model.named_parameters())
|
||||||
@@ -1421,10 +1417,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e-9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
optim="rmsprop_bnb",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
@@ -1438,10 +1437,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e-9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
optim="ademamix",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
@@ -1455,10 +1457,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix_8bit"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e-9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
optim="ademamix_8bit",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
@@ -1472,10 +1477,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_8bit"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e-9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
optim="rmsprop_bnb_8bit",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
@@ -1488,10 +1496,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_32bit"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e-9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
optim="rmsprop_bnb_32bit",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
@@ -1505,9 +1516,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
logging_nan_inf_filter=False,
|
logging_nan_inf_filter=False,
|
||||||
@@ -1528,9 +1538,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# redefine the model
|
# redefine the model
|
||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
logging_nan_inf_filter=False,
|
logging_nan_inf_filter=False,
|
||||||
@@ -1561,18 +1570,24 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=False,
|
||||||
|
report_to="none",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
log_history_no_filter = trainer.state.log_history
|
log_history_no_filter = trainer.state.log_history
|
||||||
|
|
||||||
# Trainer with inf/nan filter
|
# Trainer with inf/nan filter
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none"
|
self.get_auto_remove_tmp_dir(),
|
||||||
|
learning_rate=1e9,
|
||||||
|
logging_steps=5,
|
||||||
|
logging_nan_inf_filter=True,
|
||||||
|
report_to="none",
|
||||||
)
|
)
|
||||||
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
@@ -1591,7 +1606,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
else:
|
else:
|
||||||
n_gpu = 1
|
n_gpu = 1
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
|
trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
|
||||||
self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
|
self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
|
||||||
trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
|
trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
|
||||||
@@ -1628,12 +1643,11 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# tests that we do not require dataloader to have a .dataset attribute
|
# tests that we do not require dataloader to have a .dataset attribute
|
||||||
def test_dataloader_without_dataset(self):
|
def test_dataloader_without_dataset(self):
|
||||||
train_dataset = RegressionDataset(length=128)
|
train_dataset = RegressionDataset(length=128)
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
trainer = CustomDataloaderTrainer(
|
trainer = CustomDataloaderTrainer(
|
||||||
model=RegressionModel(),
|
model=RegressionModel(),
|
||||||
train_dataset=train_dataset,
|
train_dataset=train_dataset,
|
||||||
eval_dataset=train_dataset,
|
eval_dataset=train_dataset,
|
||||||
args=TrainingArguments(output_dir=tmp_dir, report_to="none"),
|
args=TrainingArguments(output_dir=self.get_auto_remove_tmp_dir(), report_to="none"),
|
||||||
)
|
)
|
||||||
|
|
||||||
trainer.train()
|
trainer.train()
|
||||||
@@ -1643,8 +1657,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_dataset = RegressionDataset()
|
train_dataset = RegressionDataset()
|
||||||
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none", dataloader_persistent_workers=False)
|
||||||
args = TrainingArguments(tmp_dir, report_to="none", dataloader_persistent_workers=False)
|
|
||||||
|
|
||||||
# Single evaluation dataset
|
# Single evaluation dataset
|
||||||
eval_dataset = RegressionDataset()
|
eval_dataset = RegressionDataset()
|
||||||
@@ -1687,9 +1700,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
train_dataset = RegressionDataset()
|
train_dataset = RegressionDataset()
|
||||||
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
|
||||||
tiny_gpt2 = GPT2LMHeadModel(config)
|
tiny_gpt2 = GPT2LMHeadModel(config)
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
report_to="none",
|
report_to="none",
|
||||||
dataloader_persistent_workers=True,
|
dataloader_persistent_workers=True,
|
||||||
dataloader_num_workers=2,
|
dataloader_num_workers=2,
|
||||||
@@ -1747,9 +1759,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
|
self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
|
||||||
self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
|
self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmp_dir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
use_liger_kernel=True,
|
use_liger_kernel=True,
|
||||||
)
|
)
|
||||||
Trainer(tiny_llama, args)
|
Trainer(tiny_llama, args)
|
||||||
@@ -1768,8 +1779,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
args = TrainingArguments(
|
||||||
args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True)
|
self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True
|
||||||
|
)
|
||||||
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
# Check this works
|
# Check this works
|
||||||
@@ -1786,9 +1798,10 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20)
|
args = TrainingArguments(
|
||||||
|
self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20
|
||||||
|
)
|
||||||
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
|
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
|
||||||
|
|
||||||
# Check this works
|
# Check this works
|
||||||
@@ -1805,10 +1818,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="adalomo",
|
optim="adalomo",
|
||||||
@@ -1820,16 +1832,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
|
|
||||||
@require_grokadamw
|
@require_grokadamw
|
||||||
@require_torch_gpu
|
@require_torch_gpu
|
||||||
def test_grokadamw():
|
def test_grokadamw(self):
|
||||||
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
|
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
|
||||||
tiny_llama = LlamaForCausalLM(config)
|
tiny_llama = LlamaForCausalLM(config)
|
||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=2e-5,
|
learning_rate=2e-5,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="grokadamw",
|
optim="grokadamw",
|
||||||
@@ -1848,10 +1859,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="schedule_free_adamw",
|
optim="schedule_free_adamw",
|
||||||
@@ -1950,10 +1960,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw",
|
optim="galore_adamw",
|
||||||
@@ -1972,10 +1981,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw",
|
optim="galore_adamw",
|
||||||
@@ -1995,10 +2003,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw_layerwise",
|
optim="galore_adamw_layerwise",
|
||||||
@@ -2017,10 +2024,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw_layerwise",
|
optim="galore_adamw_layerwise",
|
||||||
@@ -2040,10 +2046,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=1e-9,
|
learning_rate=1e-9,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw_8bit",
|
optim="galore_adamw_8bit",
|
||||||
@@ -2156,13 +2161,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
learning_rate = 1e-9
|
learning_rate = 1e-9
|
||||||
num_steps = 10
|
num_steps = 10
|
||||||
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
learning_rate=learning_rate,
|
learning_rate=learning_rate,
|
||||||
logging_steps=5,
|
logging_steps=5,
|
||||||
optim="galore_adamw",
|
optim="galore_adamw",
|
||||||
@@ -2182,14 +2186,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
x = torch.randint(0, 100, (128,))
|
x = torch.randint(0, 100, (128,))
|
||||||
train_dataset = RepeatDataset(x)
|
train_dataset = RepeatDataset(x)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
learning_rate = 2e-4
|
learning_rate = 2e-4
|
||||||
num_train_epochs = 2
|
num_train_epochs = 2
|
||||||
num_warmup_steps = 5
|
num_warmup_steps = 5
|
||||||
|
|
||||||
# Trainer without inf/nan filter
|
# Trainer without inf/nan filter
|
||||||
args = TrainingArguments(
|
args = TrainingArguments(
|
||||||
tmpdir,
|
self.get_auto_remove_tmp_dir(),
|
||||||
num_train_epochs=num_train_epochs,
|
num_train_epochs=num_train_epochs,
|
||||||
learning_rate=learning_rate,
|
learning_rate=learning_rate,
|
||||||
warmup_steps=num_warmup_steps,
|
warmup_steps=num_warmup_steps,
|
||||||
@@ -2707,41 +2710,41 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertNotIn(log_info_string, cl.out)
|
self.assertNotIn(log_info_string, cl.out)
|
||||||
|
|
||||||
def test_save_checkpoints(self):
|
def test_save_checkpoints(self):
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
|
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size))
|
self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size))
|
||||||
|
|
||||||
# With a regular model that is not a PreTrainedModel
|
# With a regular model that is not a PreTrainedModel
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False)
|
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, pretrained=False)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
|
self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False)
|
||||||
|
|
||||||
@require_safetensors
|
@require_safetensors
|
||||||
def test_safe_checkpoints(self):
|
def test_safe_checkpoints(self):
|
||||||
for save_safetensors in [True, False]:
|
for save_safetensors in [True, False]:
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors)
|
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, save_safetensors=save_safetensors)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.check_saved_checkpoints(
|
self.check_saved_checkpoints(
|
||||||
tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
|
tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
|
||||||
)
|
)
|
||||||
|
|
||||||
# With a regular model that is not a PreTrainedModel
|
# With a regular model that is not a PreTrainedModel
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(
|
trainer = get_regression_trainer(
|
||||||
output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
|
output_dir=tmp_dir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
|
||||||
)
|
)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.check_saved_checkpoints(
|
self.check_saved_checkpoints(
|
||||||
tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
|
tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_load_best_model_with_save(self):
|
def test_load_best_model_with_save(self):
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(
|
trainer = get_regression_trainer(
|
||||||
output_dir=tmpdir,
|
output_dir=tmp_dir,
|
||||||
save_steps=5,
|
save_steps=5,
|
||||||
evaluation_strategy="steps",
|
evaluation_strategy="steps",
|
||||||
eval_steps=5,
|
eval_steps=5,
|
||||||
@@ -2750,19 +2753,19 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
trainer.train()
|
trainer.train()
|
||||||
# Check that we have the last known step:
|
# Check that we have the last known step:
|
||||||
assert os.path.exists(
|
assert os.path.exists(
|
||||||
os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
|
os.path.join(tmp_dir, f"checkpoint-{trainer.state.max_steps}")
|
||||||
), f"Could not find checkpoint-{trainer.state.max_steps}"
|
), f"Could not find checkpoint-{trainer.state.max_steps}"
|
||||||
# And then check the last step
|
# And then check the last step
|
||||||
assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9"
|
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-9")), "Could not find checkpoint-9"
|
||||||
|
|
||||||
# Now test that using a limit works
|
# Now test that using a limit works
|
||||||
# Should result in:
|
# Should result in:
|
||||||
# - save at step 5 (but is deleted)
|
# - save at step 5 (but is deleted)
|
||||||
# - save at step 10 (loaded in at the end when `load_best_model=True`)
|
# - save at step 10 (loaded in at the end when `load_best_model=True`)
|
||||||
# - save at step 11
|
# - save at step 11
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(
|
trainer = get_regression_trainer(
|
||||||
output_dir=tmpdir,
|
output_dir=tmp_dir,
|
||||||
save_steps=5,
|
save_steps=5,
|
||||||
evaluation_strategy="steps",
|
evaluation_strategy="steps",
|
||||||
eval_steps=5,
|
eval_steps=5,
|
||||||
@@ -2772,19 +2775,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
)
|
)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
# Check that we have the last known step:
|
# Check that we have the last known step:
|
||||||
assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11"
|
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-11")), "Could not find checkpoint-11"
|
||||||
# And then check the last multiple
|
# And then check the last multiple
|
||||||
assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10"
|
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-10")), "Could not find checkpoint-10"
|
||||||
# Finally check that we don't have an old one
|
# Finally check that we don't have an old one
|
||||||
assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
|
assert not os.path.exists(os.path.join(tmp_dir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
|
||||||
|
|
||||||
# Finally check that the right model was loaded in, checkpoint-10
|
# Finally check that the right model was loaded in, checkpoint-10
|
||||||
# this goes by the last `eval` step check to do so, so it won't be
|
# this goes by the last `eval` step check to do so, so it won't be
|
||||||
# the last model *saved*
|
# the last model *saved*
|
||||||
model_state = trainer.model.state_dict()
|
model_state = trainer.model.state_dict()
|
||||||
final_model_weights = safetensors.torch.load_file(
|
final_model_weights = safetensors.torch.load_file(os.path.join(tmp_dir, "checkpoint-10", "model.safetensors"))
|
||||||
os.path.join(tmpdir, "checkpoint-10", "model.safetensors")
|
|
||||||
)
|
|
||||||
for k, v in model_state.items():
|
for k, v in model_state.items():
|
||||||
assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
|
assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
|
||||||
|
|
||||||
@@ -2794,8 +2795,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
|
# since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
|
||||||
# example DataParallel(DataParallel(model))
|
# example DataParallel(DataParallel(model))
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
|
||||||
trainer = get_regression_trainer(output_dir=tmp_dir)
|
|
||||||
trainer.train()
|
trainer.train()
|
||||||
model_wrapped_before = trainer.model_wrapped
|
model_wrapped_before = trainer.model_wrapped
|
||||||
trainer.train()
|
trainer.train()
|
||||||
@@ -2808,9 +2808,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
|
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
|
||||||
# won't be the same since the training dataloader is shuffled).
|
# won't be the same since the training dataloader is shuffled).
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"output_dir": tmpdir,
|
"output_dir": tmp_dir,
|
||||||
"train_len": 128,
|
"train_len": 128,
|
||||||
"save_steps": 5,
|
"save_steps": 5,
|
||||||
"learning_rate": 0.1,
|
"learning_rate": 0.1,
|
||||||
@@ -2821,7 +2821,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
(a, b) = trainer.model.a.item(), trainer.model.b.item()
|
(a, b) = trainer.model.a.item(), trainer.model.b.item()
|
||||||
state = dataclasses.asdict(trainer.state)
|
state = dataclasses.asdict(trainer.state)
|
||||||
|
|
||||||
checkpoint = os.path.join(tmpdir, "checkpoint-5")
|
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
|
||||||
|
|
||||||
# Reinitialize trainer
|
# Reinitialize trainer
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
@@ -2834,7 +2834,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.check_trainer_state_are_the_same(state, state1)
|
self.check_trainer_state_are_the_same(state, state1)
|
||||||
|
|
||||||
# Now check with a later checkpoint that it also works when we span over one epoch
|
# Now check with a later checkpoint that it also works when we span over one epoch
|
||||||
checkpoint = os.path.join(tmpdir, "checkpoint-15")
|
checkpoint = os.path.join(tmp_dir, "checkpoint-15")
|
||||||
|
|
||||||
# Reinitialize trainer and load model
|
# Reinitialize trainer and load model
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
@@ -2847,9 +2847,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.check_trainer_state_are_the_same(state, state1)
|
self.check_trainer_state_are_the_same(state, state1)
|
||||||
|
|
||||||
# With a regular model that is not a PreTrainedModel
|
# With a regular model that is not a PreTrainedModel
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"output_dir": tmpdir,
|
"output_dir": tmp_dir,
|
||||||
"train_len": 128,
|
"train_len": 128,
|
||||||
"save_steps": 5,
|
"save_steps": 5,
|
||||||
"learning_rate": 0.1,
|
"learning_rate": 0.1,
|
||||||
@@ -2861,7 +2861,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
(a, b) = trainer.model.a.item(), trainer.model.b.item()
|
(a, b) = trainer.model.a.item(), trainer.model.b.item()
|
||||||
state = dataclasses.asdict(trainer.state)
|
state = dataclasses.asdict(trainer.state)
|
||||||
|
|
||||||
checkpoint = os.path.join(tmpdir, "checkpoint-5")
|
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
|
||||||
|
|
||||||
# Reinitialize trainer and load model
|
# Reinitialize trainer and load model
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
@@ -2874,7 +2874,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.check_trainer_state_are_the_same(state, state1)
|
self.check_trainer_state_are_the_same(state, state1)
|
||||||
|
|
||||||
# Now check with a later checkpoint that it also works when we span over one epoch
|
# Now check with a later checkpoint that it also works when we span over one epoch
|
||||||
checkpoint = os.path.join(tmpdir, "checkpoint-15")
|
checkpoint = os.path.join(tmp_dir, "checkpoint-15")
|
||||||
|
|
||||||
# Reinitialize trainer and load model
|
# Reinitialize trainer and load model
|
||||||
trainer = get_regression_trainer(**kwargs)
|
trainer = get_regression_trainer(**kwargs)
|
||||||
@@ -2889,15 +2889,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# Now check failures
|
# Now check failures
|
||||||
|
|
||||||
# 1. fail to find a bogus checkpoint
|
# 1. fail to find a bogus checkpoint
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmpdir)
|
trainer = get_regression_trainer(output_dir=tmp_dir)
|
||||||
with self.assertRaises(Exception) as context:
|
with self.assertRaises(Exception) as context:
|
||||||
trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
|
trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
|
||||||
self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
|
self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
|
||||||
|
|
||||||
# 2. fail to find any checkpoint - due a fresh output_dir
|
# 2. fail to find any checkpoint - due a fresh output_dir
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
trainer = get_regression_trainer(output_dir=tmpdir)
|
trainer = get_regression_trainer(output_dir=tmp_dir)
|
||||||
with self.assertRaises(Exception) as context:
|
with self.assertRaises(Exception) as context:
|
||||||
trainer.train(resume_from_checkpoint=True)
|
trainer.train(resume_from_checkpoint=True)
|
||||||
self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
|
self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
|
||||||
|
|||||||
Reference in New Issue
Block a user