Support ratios for logging_steps, eval_steps, and save_steps (#23235)
* Ratio option for `logging_steps`, `eval_steps`, `save_steps` * Add guards if arguments are not set * Add more detailed comments + formatting * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Convert args values to `int` if bigger than 1 * `black` * `make fixup` --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
c34a525d2f
commit
650a71e157
@@ -251,8 +251,9 @@ class TrainingArguments:
|
||||
|
||||
logging_first_step (`bool`, *optional*, defaults to `False`):
|
||||
Whether to log and evaluate the first `global_step` or not.
|
||||
logging_steps (`int`, *optional*, defaults to 500):
|
||||
Number of update steps between two logs if `logging_strategy="steps"`.
|
||||
logging_steps (`int` or `float`, *optional*, defaults to 500):
|
||||
Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in
|
||||
range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
|
||||
logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
|
||||
Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan`
|
||||
or `inf` is filtered and the average loss of the current logging window is taken instead.
|
||||
@@ -270,8 +271,9 @@ class TrainingArguments:
|
||||
- `"no"`: No save is done during training.
|
||||
- `"epoch"`: Save is done at the end of each epoch.
|
||||
- `"steps"`: Save is done every `save_steps`.
|
||||
save_steps (`int`, *optional*, defaults to 500):
|
||||
Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
|
||||
save_steps (`int` or `float`, *optional*, defaults to 500):
|
||||
Number of update steps between two checkpoint saves if `save_strategy="steps"`. Should be an integer or a
|
||||
float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
|
||||
save_total_limit (`int`, *optional*):
|
||||
If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
|
||||
`output_dir`.
|
||||
@@ -332,9 +334,10 @@ class TrainingArguments:
|
||||
dataloader_drop_last (`bool`, *optional*, defaults to `False`):
|
||||
Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
|
||||
or not.
|
||||
eval_steps (`int`, *optional*):
|
||||
eval_steps (`int` or `float`, *optional*):
|
||||
Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
|
||||
value as `logging_steps` if not set.
|
||||
value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1,
|
||||
will be interpreted as ratio of total training steps.
|
||||
dataloader_num_workers (`int`, *optional*, defaults to 0):
|
||||
Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
|
||||
main process.
|
||||
@@ -721,13 +724,29 @@ class TrainingArguments:
|
||||
metadata={"help": "The logging strategy to use."},
|
||||
)
|
||||
logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
|
||||
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
|
||||
# Logging interval. An integer value is an absolute number of update steps; a
# value smaller than 1 is interpreted as a ratio of the total training steps.
logging_steps: float = field(
    default=500,
    metadata={
        "help": (
            # The trailing space matters: adjacent literals are concatenated as-is,
            # so without it the two sentences run together in the rendered help.
            "Log every X updates steps. Should be an integer or a float in range `[0,1)`. "
            "If smaller than 1, will be interpreted as ratio of total training steps."
        )
    },
)
|
||||
logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
|
||||
save_strategy: Union[IntervalStrategy, str] = field(
|
||||
default="steps",
|
||||
metadata={"help": "The checkpoint save strategy to use."},
|
||||
)
|
||||
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
|
||||
# Checkpoint interval. An integer value is an absolute number of update steps; a
# value smaller than 1 is interpreted as a ratio of the total training steps.
save_steps: float = field(
    default=500,
    metadata={
        "help": (
            # The trailing space matters: adjacent literals are concatenated as-is,
            # so without it the two sentences run together in the rendered help.
            "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`. "
            "If smaller than 1, will be interpreted as ratio of total training steps."
        )
    },
)
|
||||
save_total_limit: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -854,7 +873,15 @@ class TrainingArguments:
|
||||
dataloader_drop_last: bool = field(
|
||||
default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
|
||||
)
|
||||
eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."})
|
||||
# Evaluation interval. An integer value is an absolute number of update steps; a
# value smaller than 1 is interpreted as a ratio of the total training steps.
# Defaults to None (unset).
eval_steps: Optional[float] = field(
    default=None,
    metadata={
        "help": (
            # The trailing space matters: adjacent literals are concatenated as-is,
            # so without it the two sentences run together in the rendered help.
            "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`. "
            "If smaller than 1, will be interpreted as ratio of total training steps."
        )
    },
)
|
||||
dataloader_num_workers: int = field(
|
||||
default=0,
|
||||
metadata={
|
||||
@@ -1186,6 +1213,19 @@ class TrainingArguments:
|
||||
if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0:
|
||||
raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps")
|
||||
|
||||
if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1:
|
||||
if self.logging_steps != int(self.logging_steps):
|
||||
raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}")
|
||||
self.logging_steps = int(self.logging_steps)
|
||||
if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1:
|
||||
if self.eval_steps != int(self.eval_steps):
|
||||
raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}")
|
||||
self.eval_steps = int(self.eval_steps)
|
||||
if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1:
|
||||
if self.save_steps != int(self.save_steps):
|
||||
raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}")
|
||||
self.save_steps = int(self.save_steps)
|
||||
|
||||
# Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
|
||||
if self.load_best_model_at_end:
|
||||
if self.evaluation_strategy != self.save_strategy:
|
||||
@@ -1194,6 +1234,20 @@ class TrainingArguments:
|
||||
f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
|
||||
)
|
||||
if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
|
||||
if self.eval_steps < 1 or self.save_steps < 1:
|
||||
if not (self.eval_steps < 1 and self.save_steps < 1):
|
||||
raise ValueError(
|
||||
"--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
|
||||
"steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps"
|
||||
f"{self.save_steps} and eval_steps {self.eval_steps}."
|
||||
)
|
||||
# Work around floating point precision issues
|
||||
LARGE_MULTIPLIER = 1_000_000
|
||||
if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0:
|
||||
raise ValueError(
|
||||
"--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
|
||||
f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}."
|
||||
)
|
||||
raise ValueError(
|
||||
"--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
|
||||
f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
|
||||
|
||||
Reference in New Issue
Block a user