Better support for resuming training (#8878)

2020-12-01 13:45:21 -05:00
parent 21db560df3
commit 7c10dd22ae
3 changed files with 65 additions and 11 deletions
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -189,6 +189,10 @@ class TrainingArguments:
        model_parallel (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If there are more than one devices, whether to use model parallelism to distribute the model's modules
            across devices or not.
+        ignore_data_skip (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
+            stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
+            step can take a long time) but will not yield the same results as the interrupted training would have.
    """

    output_dir: str = field(
@@ -350,6 +354,12 @@ class TrainingArguments:
    greater_is_better: Optional[bool] = field(
        default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
    )
+    ignore_data_skip: bool = field(
+        default=False,
+        metadata={
+            "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data."
+        },
+    )

    def __post_init__(self):
        if self.disable_tqdm is None: