Seed _get_train_sampler's generator with arg seed to improve reproducibility (#15961)

* Seed get_train_sampler's generator with arg seed to improve reproducibility

and make the world_size<=1 code path more similar to the others

* move test file into trainer test explicitly

* dumb typo

* make style lint happy

* per discussion, switch to data_seed

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
David Hall
2022-03-08 10:45:41 -08:00
committed by GitHub
parent 70203b5937
commit 5b7dcc7342
3 changed files with 79 additions and 4 deletions

View File

@@ -591,7 +591,16 @@ class Trainer:
generator = None generator = None
if self.args.world_size <= 1 and _is_torch_generator_available: if self.args.world_size <= 1 and _is_torch_generator_available:
generator = torch.Generator() generator = torch.Generator()
generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item())) # for backwards compatibility, we generate a seed here (which is sampled from a generator seeded with
# `args.seed`) if data_seed isn't provided.
# Further on in this method, we default to `args.seed` instead.
if self.args.data_seed is None:
seed = int(torch.empty((), dtype=torch.int64).random_().item())
else:
seed = self.args.data_seed
generator.manual_seed(seed)
seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed
# Build the sampler. # Build the sampler.
if self.args.group_by_length: if self.args.group_by_length:
@@ -620,7 +629,7 @@ class Trainer:
rank=self.args.process_index, rank=self.args.process_index,
lengths=lengths, lengths=lengths,
model_input_name=model_input_name, model_input_name=model_input_name,
seed=self.args.seed, seed=seed,
) )
else: else:
@@ -638,14 +647,14 @@ class Trainer:
batch_size=self.args.per_device_train_batch_size, batch_size=self.args.per_device_train_batch_size,
num_replicas=self.args.world_size, num_replicas=self.args.world_size,
rank=self.args.process_index, rank=self.args.process_index,
seed=self.args.seed, seed=seed,
) )
else: else:
return DistributedSampler( return DistributedSampler(
self.train_dataset, self.train_dataset,
num_replicas=self.args.world_size, num_replicas=self.args.world_size,
rank=self.args.process_index, rank=self.args.process_index,
seed=self.args.seed, seed=seed,
) )
def get_train_dataloader(self) -> DataLoader: def get_train_dataloader(self) -> DataLoader:

View File

@@ -220,6 +220,10 @@ class TrainingArguments:
seed (`int`, *optional*, defaults to 42): seed (`int`, *optional*, defaults to 42):
Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the
[`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters.
data_seed (`int`, *optional*):
Random seed to be used with data samplers. If not set, random generators for data sampling will use the
same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model
seed.
bf16 (`bool`, *optional*, defaults to `False`): bf16 (`bool`, *optional*, defaults to `False`):
Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
NVIDIA architecture. This is an experimental API and it may change. NVIDIA architecture. This is an experimental API and it may change.
@@ -539,6 +543,7 @@ class TrainingArguments:
) )
no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
data_seed: int = field(default=None, metadata={"help": "Random seed to be used with data samplers."})
bf16: bool = field( bf16: bool = field(
default=False, default=False,
metadata={ metadata={

View File

@@ -647,6 +647,67 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
new_eval_dataset = RegressionDataset(length=128) new_eval_dataset = RegressionDataset(length=128)
self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu)) self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu))
def test_sampler_seed(self):
# nb: we don't want to inherit from IterableDataset to hit the right code path
class DummyDataset(torch.utils.data.Dataset):
def __init__(self, length: int = 101):
self.length = length
def __len__(self):
return self.length
def __getitem__(self, i):
if (i < 0) or (i >= self.length):
raise IndexError
return {"input_ids": [i]}
class DummyModel(PreTrainedModel):
def __init__(self, num_params: int):
super().__init__(PretrainedConfig())
# Add some (unused) params. the point here is that randomness in model_init shouldn't influence
# data loader order.
self.params = nn.Parameter(torch.randn(num_params))
def forward(self, input_ids, labels=None):
if labels is not None:
return torch.tensor(0.0, device=input_ids.device), input_ids
else:
return input_ids
def _get_first_data_sample(num_params, seed, data_seed, **kwargs):
with tempfile.TemporaryDirectory() as tmpdir:
trainer = Trainer(
model_init=lambda: DummyModel(num_params),
args=TrainingArguments(
output_dir=tmpdir,
**kwargs,
seed=seed,
data_seed=data_seed,
local_rank=-1,
),
train_dataset=DummyDataset(),
)
return next(iter(trainer.get_train_dataloader()))
# test that the seed is passed to the sampler
# the codepath we want to hit is world_size <= 1, and both group_by_length
for group_by_length in [True, False]:
sample42_1 = _get_first_data_sample(num_params=10, seed=42, data_seed=42, group_by_length=group_by_length)
sample42_2 = _get_first_data_sample(num_params=11, seed=42, data_seed=42, group_by_length=group_by_length)
self.assertTrue(torch.equal(sample42_1["input_ids"], sample42_2["input_ids"]))
# should get same samples with different seed, so long as data_seed is the same
sample42_3 = _get_first_data_sample(num_params=11, seed=11, data_seed=42, group_by_length=group_by_length)
self.assertTrue(torch.equal(sample42_1["input_ids"], sample42_3["input_ids"]))
# make sure we have some randomness in the samples if data_seed is different
others = [
_get_first_data_sample(num_params=i, seed=42, data_seed=i, group_by_length=group_by_length)
for i in range(10)
]
self.assertTrue(any(not torch.equal(sample42_1["input_ids"], sample["input_ids"]) for sample in others))
@require_torch_multi_gpu @require_torch_multi_gpu
def test_data_is_not_parallelized_when_model_is_parallel(self): def test_data_is_not_parallelized_when_model_is_parallel(self):
model = RegressionModel() model = RegressionModel()