From 5b7dcc73427d16218488846a365d10866dca9c3e Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 8 Mar 2022 10:45:41 -0800 Subject: [PATCH] Seed _get_train_sampler's generator with arg seed to improve reproducibility (#15961) * Seed get_train_sampler's generator with arg seed to improve reproducibility and make the world_size<=1 code path more similar to the others * move test file into trainer test explicitly * dumb typo * make style lint happy * per discussion, switch to data_seed * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 17 +++++++-- src/transformers/training_args.py | 5 +++ tests/trainer/test_trainer.py | 61 +++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 87a7fb90b0..8b890f435c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -591,7 +591,16 @@ class Trainer: generator = None if self.args.world_size <= 1 and _is_torch_generator_available: generator = torch.Generator() - generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item())) + # for backwards compatibility, we generate a seed here (which is sampled from a generator seeded with + # `args.seed`) if data_seed isn't provided. + # Further on in this method, we default to `args.seed` instead. + if self.args.data_seed is None: + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + else: + seed = self.args.data_seed + generator.manual_seed(seed) + + seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed # Build the sampler. if self.args.group_by_length: @@ -620,7 +629,7 @@ class Trainer: rank=self.args.process_index, lengths=lengths, model_input_name=model_input_name, - seed=self.args.seed, + seed=seed, ) else: @@ -638,14 +647,14 @@ class Trainer: batch_size=self.args.per_device_train_batch_size, num_replicas=self.args.world_size, rank=self.args.process_index, - seed=self.args.seed, + seed=seed, ) else: return DistributedSampler( self.train_dataset, num_replicas=self.args.world_size, rank=self.args.process_index, - seed=self.args.seed, + seed=seed, ) def get_train_dataloader(self) -> DataLoader: diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index e34b79166b..d8096c5efa 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -220,6 +220,10 @@ class TrainingArguments: seed (`int`, *optional*, defaults to 42): Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + data_seed (`int`, *optional*): + Random seed to be used with data samplers. If not set, random generators for data sampling will use the + same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model + seed. bf16 (`bool`, *optional*, defaults to `False`): Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher NVIDIA architecture. This is an experimental API and it may change. @@ -539,6 +543,7 @@ class TrainingArguments: ) no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) + data_seed: int = field(default=None, metadata={"help": "Random seed to be used with data samplers."}) bf16: bool = field( default=False, metadata={ diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 4e8fd2b125..a1a8a88c70 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -647,6 +647,67 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): new_eval_dataset = RegressionDataset(length=128) self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu)) + def test_sampler_seed(self): + # nb: we don't want to inherit from IterableDataset to hit the right code path + class DummyDataset(torch.utils.data.Dataset): + def __init__(self, length: int = 101): + self.length = length + + def __len__(self): + return self.length + + def __getitem__(self, i): + if (i < 0) or (i >= self.length): + raise IndexError + return {"input_ids": [i]} + + class DummyModel(PreTrainedModel): + def __init__(self, num_params: int): + super().__init__(PretrainedConfig()) + # Add some (unused) params. the point here is that randomness in model_init shouldn't influence + # data loader order. + self.params = nn.Parameter(torch.randn(num_params)) + + def forward(self, input_ids, labels=None): + if labels is not None: + return torch.tensor(0.0, device=input_ids.device), input_ids + else: + return input_ids + + def _get_first_data_sample(num_params, seed, data_seed, **kwargs): + with tempfile.TemporaryDirectory() as tmpdir: + trainer = Trainer( + model_init=lambda: DummyModel(num_params), + args=TrainingArguments( + output_dir=tmpdir, + **kwargs, + seed=seed, + data_seed=data_seed, + local_rank=-1, + ), + train_dataset=DummyDataset(), + ) + + return next(iter(trainer.get_train_dataloader())) + + # test that the seed is passed to the sampler + # the codepath we want to hit is world_size <= 1, and both group_by_length + for group_by_length in [True, False]: + sample42_1 = _get_first_data_sample(num_params=10, seed=42, data_seed=42, group_by_length=group_by_length) + sample42_2 = _get_first_data_sample(num_params=11, seed=42, data_seed=42, group_by_length=group_by_length) + self.assertTrue(torch.equal(sample42_1["input_ids"], sample42_2["input_ids"])) + + # should get same samples with different seed, so long as data_seed is the same + sample42_3 = _get_first_data_sample(num_params=11, seed=11, data_seed=42, group_by_length=group_by_length) + self.assertTrue(torch.equal(sample42_1["input_ids"], sample42_3["input_ids"])) + + # make sure we have some randomness in the samples if data_seed is different + others = [ + _get_first_data_sample(num_params=i, seed=42, data_seed=i, group_by_length=group_by_length) + for i in range(10) + ] + self.assertTrue(any(not torch.equal(sample42_1["input_ids"], sample["input_ids"]) for sample in others)) + @require_torch_multi_gpu def test_data_is_not_parallelized_when_model_is_parallel(self): model = RegressionModel()