From 650a71e157478cc8c9d9dc648a6a79108f49e047 Mon Sep 17 00:00:00 2001 From: Konstantin Dobler Date: Tue, 9 May 2023 19:05:13 +0200 Subject: [PATCH] Support ratios for `logging_steps`, `eval_steps`, and `save_steps` (#23235) * Ratio option for `logging_steps`, `eval_steps`, `save_steps` * Add guards if arguments are not set * Add more detailed comments + formatting * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Convert args values to `int` if bigger than 1 * `black` * `make fixup` --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/trainer.py | 8 ++++ src/transformers/training_args.py | 72 +++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d18e3efeb8..f7fb3558df 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1712,6 +1712,14 @@ class Trainer: f" {args.max_steps}" ) + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps and args.logging_steps < 1: + args.logging_steps = math.ceil(max_steps * args.logging_steps) + if args.eval_steps and args.eval_steps < 1: + args.eval_steps = math.ceil(max_steps * args.eval_steps) + if args.save_steps and args.save_steps < 1: + args.save_steps = math.ceil(max_steps * args.save_steps) + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: # nn.DataParallel(model) replicates the model, creating new variables and module diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 4b0ea975b2..44f28ff99e 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -251,8 +251,9 @@ class TrainingArguments: logging_first_step (`bool`, *optional*, defaults to `False`): Whether to log and evaluate the first `global_step` or not. - logging_steps (`int`, *optional*, defaults to 500): - Number of update steps between two logs if `logging_strategy="steps"`. + logging_steps (`int` or `float`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in + range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` or `inf` is filtered and the average loss of the current logging window is taken instead. @@ -270,8 +271,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. - save_steps (`int`, *optional*, defaults to 500): - Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_steps (`int` or `float`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. save_total_limit (`int`, *optional*): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. @@ -332,9 +334,10 @@ class TrainingArguments: dataloader_drop_last (`bool`, *optional*, defaults to `False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. - eval_steps (`int`, *optional*): + eval_steps (`int` or `float`, *optional*): Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same - value as `logging_steps` if not set. + value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, + will be interpreted as ratio of total training steps. dataloader_num_workers (`int`, *optional*, defaults to 0): Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. @@ -721,13 +724,29 @@ class TrainingArguments: metadata={"help": "The logging strategy to use."}, ) logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) - logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) + logging_steps: float = field( + default=500, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) save_strategy: Union[IntervalStrategy, str] = field( default="steps", metadata={"help": "The checkpoint save strategy to use."}, ) - save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_steps: float = field( + default=500, + metadata={ + "help": ( + "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) save_total_limit: Optional[int] = field( default=None, metadata={ @@ -854,7 +873,15 @@ class TrainingArguments: dataloader_drop_last: bool = field( default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} ) - eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."}) + eval_steps: Optional[float] = field( + default=None, + metadata={ + "help": ( + "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`." + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) dataloader_num_workers: int = field( default=0, metadata={ @@ -1186,6 +1213,19 @@ class TrainingArguments: if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1: + if self.logging_steps != int(self.logging_steps): + raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") + self.logging_steps = int(self.logging_steps) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_steps != int(self.eval_steps): + raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") + self.eval_steps = int(self.eval_steps) + if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: + if self.save_steps != int(self.save_steps): + raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") + self.save_steps = int(self.save_steps) + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. if self.load_best_model_at_end: if self.evaluation_strategy != self.save_strategy: @@ -1194,6 +1234,20 @@ class TrainingArguments: f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" ) if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_steps < 1 or self.save_steps < 1: + if not (self.eval_steps < 1 and self.save_steps < 1): + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps" + f"{self.save_steps} and eval_steps {self.eval_steps}." + ) + # Work around floating point precision issues + LARGE_MULTIPLIER = 1_000_000 + if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." + ) raise ValueError( "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."