Better support for resuming training (#8878)

This commit is contained in:
Sylvain Gugger
2020-12-01 13:45:21 -05:00
committed by GitHub
parent 21db560df3
commit 7c10dd22ae
3 changed files with 65 additions and 11 deletions

View File

@@ -189,6 +189,10 @@ class TrainingArguments:
model_parallel (:obj:`bool`, `optional`, defaults to :obj:`False`):
If there are more than one devices, whether to use model parallelism to distribute the model's modules
across devices or not.
ignore_data_skip (:obj:`bool`, `optional`, defaults to :obj:`False`):
When resuming training, whether or not to ignore (i.e. not perform) the step that skips the epochs and batches
already seen, which brings the data loading to the same stage as in the previous training. If set to :obj:`True`,
the training will begin faster (as that skipping step can take a long time) but will not yield the same results
as the interrupted training would have.
"""
output_dir: str = field(
@@ -350,6 +354,12 @@ class TrainingArguments:
greater_is_better: Optional[bool] = field(
default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
)
ignore_data_skip: bool = field(
default=False,
metadata={
"help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data."
},
)
def __post_init__(self):
if self.disable_tqdm is None: