🚨🚨🚨 Replace DataLoader logic for Accelerate in Trainer, remove unneeded tests 🚨🚨🚨 (#24028)

* Working integration

* Fix failing test

* Revert label host logic

* Bring it back!
This commit is contained in:
Zach Mueller
2023-06-12 11:23:37 -04:00
committed by GitHub
parent dc42a9d76f
commit ebd94b0f6f
2 changed files with 60 additions and 278 deletions

View File

@@ -798,9 +798,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
def test_train_and_eval_dataloaders(self):
n_gpu = max(1, torch.cuda.device_count())
trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16)
self.assertEqual(trainer.get_train_dataloader().batch_size, 16 * n_gpu)
self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16)
self.assertEqual(trainer.get_eval_dataloader().batch_size, 16 * n_gpu)
self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu)
# Check drop_last works
trainer = get_regression_trainer(
@@ -833,67 +833,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer.train()
trainer.evaluate()
def test_sampler_seed(self):
    """Check that the first training batch is driven by `data_seed` only.

    For both values of `group_by_length`, the first batch produced by the
    train dataloader must be identical when `data_seed` is unchanged, even if
    the number of model parameters or the global `seed` differs, and must
    vary for at least one of several different `data_seed` values.
    """

    # Deliberately a plain map-style Dataset rather than an IterableDataset, so the intended
    # code path is exercised.
    class DummyDataset(torch.utils.data.Dataset):
        def __init__(self, length: int = 101):
            self.length = length

        def __len__(self):
            return self.length

        def __getitem__(self, i):
            if not (0 <= i < self.length):
                raise IndexError
            return {"input_ids": [i]}

    class DummyModel(PreTrainedModel):
        def __init__(self, num_params: int):
            super().__init__(PretrainedConfig())
            # These parameters are never used in `forward`; they only consume random numbers
            # at init time so we can verify that model_init randomness does not leak into
            # the data loader order.
            self.params = nn.Parameter(torch.randn(num_params))

        def forward(self, input_ids, labels=None):
            if labels is None:
                return input_ids
            return torch.tensor(0.0, device=input_ids.device), input_ids

    def _first_batch(num_params, seed, data_seed, **kwargs):
        """Build a fresh Trainer and return the first batch of its train dataloader."""
        with tempfile.TemporaryDirectory() as tmpdir:
            training_args = TrainingArguments(
                output_dir=tmpdir,
                **kwargs,
                seed=seed,
                data_seed=data_seed,
                local_rank=-1,
            )
            trainer = Trainer(
                model_init=lambda: DummyModel(num_params),
                args=training_args,
                train_dataset=DummyDataset(),
            )
            loader = trainer.get_train_dataloader()
            return next(iter(loader))

    # The seed must reach the sampler. The targeted code path is world_size <= 1, exercised
    # with group_by_length both enabled and disabled.
    for group_by_length in (True, False):
        reference = _first_batch(num_params=10, seed=42, data_seed=42, group_by_length=group_by_length)
        reference_ids = reference["input_ids"]

        # A different model size (hence different init randomness) must not change the batch.
        other_model = _first_batch(num_params=11, seed=42, data_seed=42, group_by_length=group_by_length)
        self.assertTrue(torch.equal(reference_ids, other_model["input_ids"]))

        # A different global seed must not change the batch while data_seed stays the same.
        other_seed = _first_batch(num_params=11, seed=11, data_seed=42, group_by_length=group_by_length)
        self.assertTrue(torch.equal(reference_ids, other_seed["input_ids"]))

        # Changing data_seed should yield a different first batch for at least one value.
        varied = [
            _first_batch(num_params=i, seed=42, data_seed=i, group_by_length=group_by_length)
            for i in range(10)
        ]
        self.assertTrue(any(not torch.equal(reference_ids, batch["input_ids"]) for batch in varied))
@require_torch_multi_gpu
def test_data_is_not_parallelized_when_model_is_parallel(self):
model = RegressionModel()
@@ -907,9 +846,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertEqual(trainer.args.n_gpu, 1)
# The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu
self.assertEqual(trainer.get_train_dataloader().batch_size, 16)
self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16)
self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
self.assertEqual(trainer.get_eval_dataloader().batch_size, 16)
self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16)
self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)
def test_evaluate(self):
@@ -1742,26 +1681,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertIsInstance(loader, torch.utils.data.DataLoader)
self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)
def test_training_finite_iterable_dataset(self):
    """Train on a finite iterable dataset with `max_steps` set past its end.

    The dataset can supply `num_samples // batch_size` steps, but `max_steps`
    asks for one more. The test expects the first WARNING-or-higher record
    captured from the "transformers.trainer" logger to report that training
    stopped at the last available step.
    """
    batch_size = 1
    num_samples = 10
    available_steps = num_samples // batch_size

    model = RegressionPreTrainedModel(RegressionModelConfig())
    train_dataset = FiniteIterableDataset(length=num_samples)

    # Request one step more than the dataset can actually provide.
    requested_steps = available_steps + 1
    train_args = TrainingArguments(
        "..",
        max_steps=requested_steps,
        per_device_train_batch_size=batch_size,
    )
    trainer = Trainer(model, train_dataset=train_dataset, args=train_args)

    expected_message = f"stopping training at step {available_steps}!"
    with self.assertLogs("transformers.trainer", level="WARNING") as logs:
        trainer.train()
    self.assertIn(expected_message, logs.output[0])
def test_evaluation_iterable_dataset(self):
config = RegressionModelConfig(a=1.5, b=2.5)
model = RegressionPreTrainedModel(config)