Use inherited tempdir makers for tests + fix failing DS tests (#35600)

* Use existing APIs to make tempdir folders

* Fixup deepspeed too

* output_dir -> tmp_dir
This commit is contained in:
Zach Mueller
2025-01-10 10:01:58 -05:00
committed by GitHub
parent bbc00046b9
commit 1211e616a4
2 changed files with 636 additions and 620 deletions

View File

@@ -482,6 +482,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
max_grad_norm=max_grad_norm,
adam_beta1=adam_beta1,
adam_beta2=adam_beta2,
output_dir=self.get_auto_remove_tmp_dir(),
)
with self.assertRaises(Exception) as context:
trainer.train()
@@ -506,7 +507,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
trainer = get_regression_trainer(
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
self.assertNotEqual(new_a, a)
@@ -518,7 +521,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
trainer = get_regression_trainer(
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
self.assertNotEqual(new_a, a)
@@ -530,7 +535,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
trainer = get_regression_trainer(
a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
)
trainer.train()
new_a = trainer.model.a.item()
self.assertNotEqual(new_a, a)
@@ -546,7 +553,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
trainer = get_regression_trainer(
local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
)
with CaptureLogger(deepspeed_logger) as cl:
trainer.train()
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
@@ -567,6 +576,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
fp16=True,
model_init=model_init,
deepspeed=ds_config_zero3_dict,
output_dir=self.get_auto_remove_tmp_dir(),
)
n_trials = 3
@@ -588,7 +598,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
ds_config_dict["zero_force_ds_cpu_optimizer"] = False # offload is not efficient w/o CPUAdam
with mockenv_context(**self.dist_env_1_gpu):
kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
with CaptureLogger(deepspeed_logger) as cl:
@@ -604,7 +614,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
with mockenv_context(**self.dist_env_1_gpu):
kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
kwargs = {
"local_rank": 0,
"deepspeed": self.get_config_dict(stage),
"output_dir": self.get_auto_remove_tmp_dir(),
}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
@@ -630,6 +644,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
"deepspeed": self.get_config_dict(stage),
"per_device_train_batch_size": 8,
"logging_steps": 1,
"output_dir": self.get_auto_remove_tmp_dir(),
}
kwargs[dtype] = True
trainer = get_regression_trainer(**kwargs)
@@ -673,6 +688,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
"local_rank": 0,
"train_len": train_len,
"deepspeed": self.get_config_dict(stage),
"output_dir": self.get_auto_remove_tmp_dir(),
}
kwargs[dtype] = True

View File

@@ -1222,8 +1222,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_dataset = RegressionDataset()
eval_dataset = RegressionDataset()
model = RegressionDictModel()
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(tmp_dir, report_to="none")
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
trainer.train()
_ = trainer.evaluate()
@@ -1234,8 +1233,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
tiny_gpt2 = GPT2LMHeadModel(config)
x = torch.randint(0, 100, (128,))
eval_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(tmp_dir, report_to="none")
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
# By default the past_key_values are removed
result = trainer.predict(eval_dataset)
@@ -1246,7 +1244,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertEqual(len(result.predictions), 2)
def test_training_arguments_are_left_untouched(self):
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir)
trainer.train()
args = TrainingArguments(tmp_dir, report_to=[])
@@ -1258,7 +1256,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
def test_number_of_steps_in_training(self):
# Regular training has n_epochs * len(train_dl) steps
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
train_output = trainer.train()
self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
@@ -1277,7 +1275,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
@require_intel_extension_for_pytorch
def test_number_of_steps_in_training_with_ipex(self):
for mix_bf16 in [True, False]:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = self.get_auto_remove_tmp_dir()
# Regular training has n_epochs * len(train_dl) steps
trainer = get_regression_trainer(
learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
@@ -1311,9 +1309,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
per_device_train_batch_size=2,
torch_compile=True,
max_steps=1, # compile happens on the first step
@@ -1348,9 +1345,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
)
@@ -1387,9 +1383,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
tokenizer.pad_token = tokenizer.eos_token
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
args = TrainingArguments(
tmpdir,
tmp_dir,
per_device_train_batch_size=1,
learning_rate=1e-9,
save_steps=5,
@@ -1406,7 +1402,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# Reinitialize trainer
trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
checkpoint = os.path.join(tmpdir, "checkpoint-5")
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
trainer.train(resume_from_checkpoint=checkpoint)
parameters1 = dict(tiny_model.named_parameters())
@@ -1421,10 +1417,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb"
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
optim="rmsprop_bnb",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1438,10 +1437,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix"
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
optim="ademamix",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1455,10 +1457,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix_8bit"
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
optim="ademamix_8bit",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1472,10 +1477,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_8bit"
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
optim="rmsprop_bnb_8bit",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1488,10 +1496,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
tiny_gpt2 = GPT2LMHeadModel(config)
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_32bit"
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
optim="rmsprop_bnb_32bit",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1505,9 +1516,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_dataset = RepeatDataset(x)
# Trainer without inf/nan filter
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
@@ -1528,9 +1538,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# redefine the model
tiny_gpt2 = GPT2LMHeadModel(config)
# Trainer without inf/nan filter
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
logging_nan_inf_filter=False,
@@ -1561,18 +1570,24 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_dataset = RepeatDataset(x)
# Trainer without inf/nan filter
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none"
self.get_auto_remove_tmp_dir(),
learning_rate=1e9,
logging_steps=5,
logging_nan_inf_filter=False,
report_to="none",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
trainer.train()
log_history_no_filter = trainer.state.log_history
# Trainer with inf/nan filter
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none"
self.get_auto_remove_tmp_dir(),
learning_rate=1e9,
logging_steps=5,
logging_nan_inf_filter=True,
report_to="none",
)
trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
trainer.train()
@@ -1591,7 +1606,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
else:
n_gpu = 1
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
@@ -1628,12 +1643,11 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# tests that we do not require dataloader to have a .dataset attribute
def test_dataloader_without_dataset(self):
train_dataset = RegressionDataset(length=128)
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = CustomDataloaderTrainer(
model=RegressionModel(),
train_dataset=train_dataset,
eval_dataset=train_dataset,
args=TrainingArguments(output_dir=tmp_dir, report_to="none"),
args=TrainingArguments(output_dir=self.get_auto_remove_tmp_dir(), report_to="none"),
)
trainer.train()
@@ -1643,8 +1657,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_dataset = RegressionDataset()
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
tiny_gpt2 = GPT2LMHeadModel(config)
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(tmp_dir, report_to="none", dataloader_persistent_workers=False)
args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none", dataloader_persistent_workers=False)
# Single evaluation dataset
eval_dataset = RegressionDataset()
@@ -1687,9 +1700,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
train_dataset = RegressionDataset()
config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
tiny_gpt2 = GPT2LMHeadModel(config)
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
report_to="none",
dataloader_persistent_workers=True,
dataloader_num_workers=2,
@@ -1747,9 +1759,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
with tempfile.TemporaryDirectory() as tmp_dir:
args = TrainingArguments(
tmp_dir,
self.get_auto_remove_tmp_dir(),
use_liger_kernel=True,
)
Trainer(tiny_llama, args)
@@ -1768,8 +1779,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True)
args = TrainingArguments(
self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True
)
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
# Check this works
@@ -1786,9 +1798,10 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20)
args = TrainingArguments(
self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20
)
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
# Check this works
@@ -1805,10 +1818,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="adalomo",
@@ -1820,16 +1832,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
@require_grokadamw
@require_torch_gpu
def test_grokadamw():
def test_grokadamw(self):
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
tiny_llama = LlamaForCausalLM(config)
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=2e-5,
logging_steps=5,
optim="grokadamw",
@@ -1848,10 +1859,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="schedule_free_adamw",
@@ -1950,10 +1960,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="galore_adamw",
@@ -1972,10 +1981,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="galore_adamw",
@@ -1995,10 +2003,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="galore_adamw_layerwise",
@@ -2017,10 +2024,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="galore_adamw_layerwise",
@@ -2040,10 +2046,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=1e-9,
logging_steps=5,
optim="galore_adamw_8bit",
@@ -2156,13 +2161,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
learning_rate = 1e-9
num_steps = 10
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
learning_rate=learning_rate,
logging_steps=5,
optim="galore_adamw",
@@ -2182,14 +2186,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
with tempfile.TemporaryDirectory() as tmpdir:
learning_rate = 2e-4
num_train_epochs = 2
num_warmup_steps = 5
# Trainer without inf/nan filter
args = TrainingArguments(
tmpdir,
self.get_auto_remove_tmp_dir(),
num_train_epochs=num_train_epochs,
learning_rate=learning_rate,
warmup_steps=num_warmup_steps,
@@ -2707,41 +2710,41 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertNotIn(log_info_string, cl.out)
def test_save_checkpoints(self):
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5)
trainer.train()
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size))
self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size))
# With a regular model that is not a PreTrainedModel
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False)
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, pretrained=False)
trainer.train()
self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False)
@require_safetensors
def test_safe_checkpoints(self):
for save_safetensors in [True, False]:
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors)
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, save_safetensors=save_safetensors)
trainer.train()
self.check_saved_checkpoints(
tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
)
# With a regular model that is not a PreTrainedModel
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(
output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
output_dir=tmp_dir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
)
trainer.train()
self.check_saved_checkpoints(
tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
)
def test_load_best_model_with_save(self):
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(
output_dir=tmpdir,
output_dir=tmp_dir,
save_steps=5,
evaluation_strategy="steps",
eval_steps=5,
@@ -2750,19 +2753,19 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer.train()
# Check that we have the last known step:
assert os.path.exists(
os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
os.path.join(tmp_dir, f"checkpoint-{trainer.state.max_steps}")
), f"Could not find checkpoint-{trainer.state.max_steps}"
# And then check the last step
assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9"
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-9")), "Could not find checkpoint-9"
# Now test that using a limit works
# Should result in:
# - save at step 5 (but is deleted)
# - save at step 10 (loaded in at the end when `load_best_model=True`)
# - save at step 11
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(
output_dir=tmpdir,
output_dir=tmp_dir,
save_steps=5,
evaluation_strategy="steps",
eval_steps=5,
@@ -2772,19 +2775,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
)
trainer.train()
# Check that we have the last known step:
assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11"
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-11")), "Could not find checkpoint-11"
# And then check the last multiple
assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10"
assert os.path.exists(os.path.join(tmp_dir, "checkpoint-10")), "Could not find checkpoint-10"
# Finally check that we don't have an old one
assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
assert not os.path.exists(os.path.join(tmp_dir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
# Finally check that the right model was loaded in, checkpoint-10
# this goes by the last `eval` step check to do so, so it won't be
# the last model *saved*
model_state = trainer.model.state_dict()
final_model_weights = safetensors.torch.load_file(
os.path.join(tmpdir, "checkpoint-10", "model.safetensors")
)
final_model_weights = safetensors.torch.load_file(os.path.join(tmp_dir, "checkpoint-10", "model.safetensors"))
for k, v in model_state.items():
assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
@@ -2794,8 +2795,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
# example DataParallel(DataParallel(model))
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(output_dir=tmp_dir)
trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
trainer.train()
model_wrapped_before = trainer.model_wrapped
trainer.train()
@@ -2808,9 +2808,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
# won't be the same since the training dataloader is shuffled).
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
kwargs = {
"output_dir": tmpdir,
"output_dir": tmp_dir,
"train_len": 128,
"save_steps": 5,
"learning_rate": 0.1,
@@ -2821,7 +2821,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
(a, b) = trainer.model.a.item(), trainer.model.b.item()
state = dataclasses.asdict(trainer.state)
checkpoint = os.path.join(tmpdir, "checkpoint-5")
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
# Reinitialize trainer
trainer = get_regression_trainer(**kwargs)
@@ -2834,7 +2834,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.check_trainer_state_are_the_same(state, state1)
# Now check with a later checkpoint that it also works when we span over one epoch
checkpoint = os.path.join(tmpdir, "checkpoint-15")
checkpoint = os.path.join(tmp_dir, "checkpoint-15")
# Reinitialize trainer and load model
trainer = get_regression_trainer(**kwargs)
@@ -2847,9 +2847,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.check_trainer_state_are_the_same(state, state1)
# With a regular model that is not a PreTrainedModel
with tempfile.TemporaryDirectory() as tmpdir:
tmp_dir = self.get_auto_remove_tmp_dir()
kwargs = {
"output_dir": tmpdir,
"output_dir": tmp_dir,
"train_len": 128,
"save_steps": 5,
"learning_rate": 0.1,
@@ -2861,7 +2861,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
(a, b) = trainer.model.a.item(), trainer.model.b.item()
state = dataclasses.asdict(trainer.state)
checkpoint = os.path.join(tmpdir, "checkpoint-5")
checkpoint = os.path.join(tmp_dir, "checkpoint-5")
# Reinitialize trainer and load model
trainer = get_regression_trainer(**kwargs)
@@ -2874,7 +2874,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.check_trainer_state_are_the_same(state, state1)
# Now check with a later checkpoint that it also works when we span over one epoch
checkpoint = os.path.join(tmpdir, "checkpoint-15")
checkpoint = os.path.join(tmp_dir, "checkpoint-15")
# Reinitialize trainer and load model
trainer = get_regression_trainer(**kwargs)
@@ -2889,15 +2889,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# Now check failures
# 1. fail to find a bogus checkpoint
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir)
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir)
with self.assertRaises(Exception) as context:
trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
# 2. fail to find any checkpoint - due a fresh output_dir
with tempfile.TemporaryDirectory() as tmpdir:
trainer = get_regression_trainer(output_dir=tmpdir)
tmp_dir = self.get_auto_remove_tmp_dir()
trainer = get_regression_trainer(output_dir=tmp_dir)
with self.assertRaises(Exception) as context:
trainer.train(resume_from_checkpoint=True)
self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))