From c33f6046c3dab8f41bedf893404e6469dea3bce8 Mon Sep 17 00:00:00 2001 From: hasan salim kanmaz <49716619+hasansalimkanmaz@users.noreply.github.com> Date: Wed, 11 May 2022 15:37:13 +0200 Subject: [PATCH] [WIP] Enable reproducibility for distributed trainings (#16907) * add seed worker and set_deterministic_seed_for_cuda function to enforce reproducibility * change function name to enable determinism, add docstrings, reproducibility support for tf * change function name to enable_determinism_for_distributed_training * revert changes in set_seed and call set_seed within enable_full_determinism * add one position argument for seed_worker function * add full_determinism flag in training args and call enable_full_determinism when it is true * add enable_full_determinism to documentation * apply make fixup after the last commit * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/en/internal/trainer_utils.mdx | 2 ++ src/transformers/__init__.py | 4 +-- src/transformers/trainer.py | 7 +++-- src/transformers/trainer_tf.py | 11 ++++++-- src/transformers/trainer_utils.py | 33 +++++++++++++++++++++++ src/transformers/training_args.py | 9 +++++++ 6 files changed, 60 insertions(+), 6 deletions(-) diff --git a/docs/source/en/internal/trainer_utils.mdx b/docs/source/en/internal/trainer_utils.mdx index 054bd69b44..bba182d5ab 100644 --- a/docs/source/en/internal/trainer_utils.mdx +++ b/docs/source/en/internal/trainer_utils.mdx @@ -22,6 +22,8 @@ Most of those are only useful if you are studying the code of the Trainer in the [[autodoc]] IntervalStrategy +[[autodoc]] enable_full_determinism + [[autodoc]] set_seed [[autodoc]] torch_distributed_zero_first diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c96e1a8699..cd70cd58cc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py 
@@ -372,7 +372,7 @@ _import_structure = { "TrainerControl", "TrainerState", ], - "trainer_utils": ["EvalPrediction", "IntervalStrategy", "SchedulerType", "set_seed"], + "trainer_utils": ["EvalPrediction", "IntervalStrategy", "SchedulerType", "enable_full_determinism", "set_seed"], "training_args": ["TrainingArguments"], "training_args_seq2seq": ["Seq2SeqTrainingArguments"], "training_args_tf": ["TFTrainingArguments"], @@ -2810,7 +2810,7 @@ if TYPE_CHECKING: TrainerControl, TrainerState, ) - from .trainer_utils import EvalPrediction, IntervalStrategy, SchedulerType, set_seed + from .trainer_utils import EvalPrediction, IntervalStrategy, SchedulerType, enable_full_determinism, set_seed from .training_args import TrainingArguments from .training_args_seq2seq import Seq2SeqTrainingArguments from .training_args_tf import TFTrainingArguments diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 20392bfb94..9855f29a46 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -115,10 +115,12 @@ from .trainer_utils import ( default_compute_objective, default_hp_space, denumpify_detensorize, + enable_full_determinism, find_executable_batch_size, get_last_checkpoint, has_length, number_of_arguments, + seed_worker, set_seed, speed_metrics, ) @@ -300,7 +302,7 @@ class Trainer: args = TrainingArguments(output_dir=output_dir) self.args = args # Seed must be set before instantiating the model when using model - set_seed(self.args.seed) + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) self.hp_name = None self.deepspeed = None self.is_in_train = False @@ -746,6 +748,7 @@ class Trainer: drop_last=self.args.dataloader_drop_last, num_workers=self.args.dataloader_num_workers, pin_memory=self.args.dataloader_pin_memory, + worker_init_fn=seed_worker, ) def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]: @@ -1254,7 +1257,7 @@ class Trainer: model_reloaded = 
False if self.model_init is not None: # Seed must be set before instantiating the model when using model_init. - set_seed(args.seed) + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) self.model = self.call_model_init(trial) model_reloaded = True # Reinitializes optimizer and scheduler diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 71c2e691d2..737dd4deaf 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -34,7 +34,14 @@ from tensorflow.python.distribute.values import PerReplica from .modeling_tf_utils import TFPreTrainedModel from .optimization_tf import GradientAccumulator, create_optimizer -from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, IntervalStrategy, PredictionOutput, set_seed +from .trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + EvalPrediction, + IntervalStrategy, + PredictionOutput, + enable_full_determinism, + set_seed, +) from .training_args_tf import TFTrainingArguments from .utils import logging @@ -134,7 +141,7 @@ class TFTrainer: "see https://www.comet.ml/docs/python-sdk/huggingface/" ) - set_seed(self.args.seed) + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) def get_train_tfdataset(self) -> tf.data.Dataset: """ diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 62cab858b7..d74d0aed9f 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -47,6 +47,39 @@ if is_tf_available(): import tensorflow as tf +def seed_worker(_): + """ + Helper function to set worker seed during Dataloader initialization. + """ + worker_seed = torch.initial_seed() % 2**32 + set_seed(worker_seed) + + +def enable_full_determinism(seed: int): + """ + Helper function for reproducible behavior during distributed training. 
See + - https://pytorch.org/docs/stable/notes/randomness.html for pytorch + - https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism for tensorflow + """ + # set seed first + set_seed(seed) + + if is_torch_available(): + # Enable PyTorch deterministic mode. This potentially requires either the environment + # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, + # depending on the CUDA version, so we set them both here + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + torch.use_deterministic_algorithms(True) + + # Enable CUDNN deterministic mode + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + if is_tf_available(): + tf.config.experimental.enable_op_determinism() + + def set_seed(seed: int): """ Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed). diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index cd21fb5436..cb929ab631 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -448,6 +448,9 @@ class TrainingArguments: auto_find_batch_size (`bool`, *optional*, defaults to `False`) Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding CUDA Out-of-Memory errors. 
Requires accelerate to be installed (`pip install accelerate`) + full_determinism (`bool`, *optional*, defaults to `False`) + If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in + distributed training """ output_dir: str = field( @@ -816,6 +819,12 @@ class TrainingArguments: "help": "Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory was reached" }, ) + full_determinism: bool = field( + default=False, + metadata={ + "help": "Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training" + }, + ) def __post_init__(self): # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then).