Support ratios for logging_steps, eval_steps, and save_steps (#23235)

* Ratio option for `logging_steps`, `eval_steps`, `save_steps`

* Add guards if arguments are not set

* Add more detailed comments + formatting

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Convert args values to `int` if bigger than 1

* `black`

* `make fixup`

---------

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Konstantin Dobler
2023-05-09 19:05:13 +02:00
committed by GitHub
parent c34a525d2f
commit 650a71e157
2 changed files with 71 additions and 9 deletions

View File

@@ -1712,6 +1712,14 @@ class Trainer:
f" {args.max_steps}" f" {args.max_steps}"
) )
# Compute absolute values for logging, eval, and save if given as ratio
if args.logging_steps and args.logging_steps < 1:
args.logging_steps = math.ceil(max_steps * args.logging_steps)
if args.eval_steps and args.eval_steps < 1:
args.eval_steps = math.ceil(max_steps * args.eval_steps)
if args.save_steps and args.save_steps < 1:
args.save_steps = math.ceil(max_steps * args.save_steps)
if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
if self.args.n_gpu > 1: if self.args.n_gpu > 1:
# nn.DataParallel(model) replicates the model, creating new variables and module # nn.DataParallel(model) replicates the model, creating new variables and module

View File

@@ -251,8 +251,9 @@ class TrainingArguments:
logging_first_step (`bool`, *optional*, defaults to `False`): logging_first_step (`bool`, *optional*, defaults to `False`):
Whether to log and evaluate the first `global_step` or not. Whether to log and evaluate the first `global_step` or not.
logging_steps (`int`, *optional*, defaults to 500): logging_steps (`int` or `float`, *optional*, defaults to 500):
Number of update steps between two logs if `logging_strategy="steps"`. Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in
range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan`
or `inf` is filtered and the average loss of the current logging window is taken instead. or `inf` is filtered and the average loss of the current logging window is taken instead.
@@ -270,8 +271,9 @@ class TrainingArguments:
- `"no"`: No save is done during training. - `"no"`: No save is done during training.
- `"epoch"`: Save is done at the end of each epoch. - `"epoch"`: Save is done at the end of each epoch.
- `"steps"`: Save is done every `save_steps`. - `"steps"`: Save is done every `save_steps`.
save_steps (`int`, *optional*, defaults to 500): save_steps (`int` or `float`, *optional*, defaults to 500):
Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a
float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
save_total_limit (`int`, *optional*): save_total_limit (`int`, *optional*):
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
`output_dir`. `output_dir`.
@@ -332,9 +334,10 @@ class TrainingArguments:
dataloader_drop_last (`bool`, *optional*, defaults to `False`): dataloader_drop_last (`bool`, *optional*, defaults to `False`):
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
or not. or not.
eval_steps (`int`, *optional*): eval_steps (`int` or `float`, *optional*):
Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
value as `logging_steps` if not set. value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1,
will be interpreted as ratio of total training steps.
dataloader_num_workers (`int`, *optional*, defaults to 0): dataloader_num_workers (`int`, *optional*, defaults to 0):
Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
main process. main process.
@@ -721,13 +724,29 @@ class TrainingArguments:
metadata={"help": "The logging strategy to use."}, metadata={"help": "The logging strategy to use."},
) )
logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) logging_steps: float = field(
default=500,
metadata={
"help": (
"Log every X updates steps. Should be an integer or a float in range `[0,1)`."
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
save_strategy: Union[IntervalStrategy, str] = field( save_strategy: Union[IntervalStrategy, str] = field(
default="steps", default="steps",
metadata={"help": "The checkpoint save strategy to use."}, metadata={"help": "The checkpoint save strategy to use."},
) )
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_steps: float = field(
default=500,
metadata={
"help": (
"Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`."
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
save_total_limit: Optional[int] = field( save_total_limit: Optional[int] = field(
default=None, default=None,
metadata={ metadata={
@@ -854,7 +873,15 @@ class TrainingArguments:
dataloader_drop_last: bool = field( dataloader_drop_last: bool = field(
default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
) )
eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."}) eval_steps: Optional[float] = field(
default=None,
metadata={
"help": (
"Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`."
"If smaller than 1, will be interpreted as ratio of total training steps."
)
},
)
dataloader_num_workers: int = field( dataloader_num_workers: int = field(
default=0, default=0,
metadata={ metadata={
@@ -1186,6 +1213,19 @@ class TrainingArguments:
if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0:
raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps")
if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1:
if self.logging_steps != int(self.logging_steps):
raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}")
self.logging_steps = int(self.logging_steps)
if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1:
if self.eval_steps != int(self.eval_steps):
raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}")
self.eval_steps = int(self.eval_steps)
if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1:
if self.save_steps != int(self.save_steps):
raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}")
self.save_steps = int(self.save_steps)
# Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
if self.load_best_model_at_end: if self.load_best_model_at_end:
if self.evaluation_strategy != self.save_strategy: if self.evaluation_strategy != self.save_strategy:
@@ -1194,6 +1234,20 @@ class TrainingArguments:
f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
) )
if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
if self.eval_steps < 1 or self.save_steps < 1:
if not (self.eval_steps < 1 and self.save_steps < 1):
raise ValueError(
"--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
"steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps"
f"{self.save_steps} and eval_steps {self.eval_steps}."
)
# Work around floating point precision issues
LARGE_MULTIPLIER = 1_000_000
if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0:
raise ValueError(
"--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}."
)
raise ValueError( raise ValueError(
"--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."