From 3081d3868e8cefb2d851e713ac7e94cf5f449f28 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 14 Sep 2021 08:02:15 -0400
Subject: [PATCH] Push to hub when saving checkpoints (#13503)

* Push to hub when saving checkpoints

* Add model card

* Revert partial model card

* Small fix for checkpoint

* Add tests

* Add documentation

* Fix tests

* Bump huggingface_hub

* Fix test
---
 docs/source/main_classes/trainer.rst          |  23 +++
 setup.py                                      |   2 +-
 src/transformers/dependency_versions_table.py |   2 +-
 src/transformers/trainer.py                   | 153 +++++++++++++-----
 src/transformers/trainer_utils.py             |   7 +
 src/transformers/training_args.py             |  23 ++-
 tests/test_trainer.py                         |  61 ++++++-
 7 files changed, 227 insertions(+), 44 deletions(-)

diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index 4c3a947743..9429136c49 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -119,6 +119,29 @@ TFTrainingArguments
     :members:
 
 
+Checkpoints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the
+:class:`~transformers.TrainingArguments` you are using. Those will go in subfolder named :obj:`checkpoint-xxx` with xxx
+being the step at which the training was at.
+
+Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either:
+
+- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint
+- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
+  passed.
+
+In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. By default, all
+the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
+the :obj:`hub-strategy` value of your :class:`~transformers.TrainingArguments` to either:
+
+- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to
+  resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
+- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one
+  checkpoint folder per folder in your final repository)
+
+
 Logging
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/setup.py b/setup.py
index 27095b2cca..c7763775a7 100644
--- a/setup.py
+++ b/setup.py
@@ -100,7 +100,7 @@ _deps = [
     "flax>=0.3.4",
     "fugashi>=1.0",
     "GitPython<3.1.19",
-    "huggingface-hub>=0.0.12",
+    "huggingface-hub>=0.0.17",
     "importlib_metadata",
     "ipadic>=1.0.0,<2.0",
     "isort>=5.5.4",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index bf548c73e6..1e61b7acde 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -18,7 +18,7 @@ deps = {
     "flax": "flax>=0.3.4",
     "fugashi": "fugashi>=1.0",
     "GitPython": "GitPython<3.1.19",
-    "huggingface-hub": "huggingface-hub>=0.0.12",
+    "huggingface-hub": "huggingface-hub>=0.0.17",
     "importlib_metadata": "importlib_metadata",
     "ipadic": "ipadic>=1.0.0,<2.0",
     "isort": "isort>=5.5.4",
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 77471ea137..26431cd86d 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -110,6 +110,8 @@ from .trainer_utils import (
     EvalLoopOutput,
     EvalPrediction,
     HPSearchBackend,
+    HubStrategy,
+    IntervalStrategy,
     PredictionOutput,
     ShardedDDPOption,
     TrainerMemoryTracker,
@@ -180,6 +182,14 @@ if TYPE_CHECKING:
 logger = logging.get_logger(__name__)
 
 
+# Name of the files used for checkpointing
+TRAINING_ARGS_NAME = "training_args.bin"
+TRAINER_STATE_NAME = "trainer_state.json"
+OPTIMIZER_NAME = "optimizer.pt"
+SCHEDULER_NAME = "scheduler.pt"
+SCALER_NAME = "scaler.pt"
+
+
 class Trainer:
     """
     Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
@@ -389,6 +399,12 @@ class Trainer:
         # Create clone of distant repo and output directory if needed
         if self.args.push_to_hub:
             self.init_git_repo()
+            # In case of pull, we need to make sure every process has the latest.
+            if is_torch_tpu_available():
+                xm.rendezvous("init git repo")
+            elif args.local_rank != -1:
+                dist.barrier()
+
         if self.args.should_save:
             os.makedirs(self.args.output_dir, exist_ok=True)
 
@@ -901,9 +917,9 @@ class Trainer:
             output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
             self.save_model(output_dir)
             if self.args.should_save:
-                self.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))
-                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
+                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
 
     def call_model_init(self, trial=None):
         model_init_argcount = number_of_arguments(self.model_init)
@@ -1183,9 +1199,9 @@ class Trainer:
 
         # Check if continuing training from a checkpoint
         if resume_from_checkpoint is not None and os.path.isfile(
-            os.path.join(resume_from_checkpoint, "trainer_state.json")
+            os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
         ):
-            self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, "trainer_state.json"))
+            self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
             epochs_trained = self.state.global_step // num_update_steps_per_epoch
             if not args.ignore_data_skip:
                 steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
@@ -1520,9 +1536,9 @@ class Trainer:
 
         if is_torch_tpu_available():
             xm.rendezvous("saving_optimizer_states")
-            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
             with warnings.catch_warnings(record=True) as caught_warnings:
-                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
                 reissue_pt_warnings(caught_warnings)
         elif is_sagemaker_mp_enabled():
             if smp.dp_rank() == 0:
@@ -1530,20 +1546,20 @@ class Trainer:
                 opt_state_dict = self.optimizer.state_dict()
                 # Save it and the scheduler on the main process
                 if self.args.should_save:
-                    torch.save(opt_state_dict, os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(opt_state_dict, os.path.join(output_dir, OPTIMIZER_NAME))
                     with warnings.catch_warnings(record=True) as caught_warnings:
-                        torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                        torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
                     reissue_pt_warnings(caught_warnings)
                     if self.use_amp:
-                        torch.save(self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt"))
+                        torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
         elif self.args.should_save and not self.deepspeed:
             # deepspeed.save_checkpoint above saves model/optim/sched
-            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
             with warnings.catch_warnings(record=True) as caught_warnings:
-                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
             reissue_pt_warnings(caught_warnings)
             if self.use_amp:
-                torch.save(self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt"))
+                torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
 
         # Determine the new best metric / best model checkpoint
         if metrics is not None and self.args.metric_for_best_model is not None:
@@ -1563,7 +1579,7 @@ class Trainer:
 
         # Save the Trainer state
         if self.args.should_save:
-            self.state.save_to_json(os.path.join(output_dir, "trainer_state.json"))
+            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
 
         # Save RNG state in non-distributed training
         rng_states = {
@@ -1590,6 +1606,9 @@ class Trainer:
         else:
             torch.save(rng_states, os.path.join(output_dir, f"rng_state_{local_rank}.pth"))
 
+        if self.args.push_to_hub:
+            self._push_from_checkpoint(output_dir)
+
         # Maybe delete some older checkpoints.
         if self.args.should_save:
             self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
@@ -1603,15 +1622,15 @@ class Trainer:
             # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
             return
 
-        if os.path.isfile(os.path.join(checkpoint, "optimizer.pt")) and os.path.isfile(
-            os.path.join(checkpoint, "scheduler.pt")
+        if os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) and os.path.isfile(
+            os.path.join(checkpoint, SCHEDULER_NAME)
         ):
             # Load in optimizer and scheduler states
             if is_torch_tpu_available():
                 # On TPU we have to take some extra precautions to properly load the states on the right device.
-                optimizer_state = torch.load(os.path.join(checkpoint, "optimizer.pt"), map_location="cpu")
+                optimizer_state = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu")
                 with warnings.catch_warnings(record=True) as caught_warnings:
-                    lr_scheduler_state = torch.load(os.path.join(checkpoint, "scheduler.pt"), map_location="cpu")
+                    lr_scheduler_state = torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu")
                 reissue_pt_warnings(caught_warnings)
 
                 xm.send_cpu_data_to_device(optimizer_state, self.args.device)
@@ -1622,13 +1641,13 @@ class Trainer:
             else:
                 map_location = "cpu" if is_sagemaker_mp_enabled() else self.args.device
                 self.optimizer.load_state_dict(
-                    torch.load(os.path.join(checkpoint, "optimizer.pt"), map_location=map_location)
+                    torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
                 )
                 with warnings.catch_warnings(record=True) as caught_warnings:
-                    self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, "scheduler.pt")))
+                    self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME)))
                 reissue_pt_warnings(caught_warnings)
-                if self.use_amp and os.path.isfile(os.path.join(checkpoint, "scaler.pt")):
-                    self.scaler.load_state_dict(torch.load(os.path.join(checkpoint, "scaler.pt")))
+                if self.use_amp and os.path.isfile(os.path.join(checkpoint, SCALER_NAME)):
+                    self.scaler.load_state_dict(torch.load(os.path.join(checkpoint, SCALER_NAME)))
 
     def hyperparameter_search(
         self,
@@ -1908,7 +1927,7 @@ class Trainer:
 
         if xm.is_master_ordinal():
             os.makedirs(output_dir, exist_ok=True)
-            torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+            torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
 
         # Save a trained model and configuration using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
@@ -1953,7 +1972,7 @@ class Trainer:
             self.tokenizer.save_pretrained(output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
 
     def store_flos(self):
         # Storing the number of floating-point operations that went into the model
@@ -2476,9 +2495,9 @@ class Trainer:
 
     def init_git_repo(self):
         """
-        Initializes a git repo in :obj:`self.args.push_to_hub_model_id`.
+        Initializes a git repo in :obj:`self.args.hub_model_id`.
         """
-        if not self.args.should_save:
+        if not self.is_world_process_zero():
             return
         use_auth_token = True if self.args.hub_token is None else self.args.hub_token
         if self.args.hub_model_id is None:
@@ -2486,17 +2505,36 @@ class Trainer:
         else:
             repo_name = self.args.hub_model_id
 
-        self.repo = Repository(
-            self.args.output_dir,
-            clone_from=repo_name,
-            use_auth_token=use_auth_token,
-        )
+        try:
+            self.repo = Repository(
+                self.args.output_dir,
+                clone_from=repo_name,
+                use_auth_token=use_auth_token,
+            )
+        except EnvironmentError:
+            if self.args.overwrite_output_dir:
+                # Try again after wiping output_dir
+                shutil.rmtree(self.args.output_dir)
+                self.repo = Repository(
+                    self.args.output_dir,
+                    clone_from=repo_name,
+                    use_auth_token=use_auth_token,
+                )
+            else:
+                raise
+
+        self.repo.git_pull()
 
         # By default, ignore the checkpoint folders
-        if not os.path.exists(os.path.join(self.args.output_dir, ".gitignore")):
+        if (
+            not os.path.exists(os.path.join(self.args.output_dir, ".gitignore"))
+            and self.args.hub_strategy != HubStrategy.ALL_CHECKPOINTS
+        ):
             with open(os.path.join(self.args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
                 writer.writelines(["checkpoint-*/"])
 
+        self.push_in_progress = None
+
     def create_model_card(
         self,
         language: Optional[str] = None,
@@ -2525,18 +2563,61 @@ class Trainer:
         with open(os.path.join(self.args.output_dir, "README.md"), "w") as f:
             f.write(model_card)
 
-    def push_to_hub(self, commit_message: Optional[str] = "add model", **kwargs) -> str:
+    def _push_from_checkpoint(self, checkpoint_folder):
+        # Only push from one node.
+        if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
+            return
+        # If we haven't finished the last push, we don't do this one.
+        if self.push_in_progress is not None and not self.push_in_progress.is_done:
+            return
+
+        output_dir = self.args.output_dir
+        # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
+        modeling_files = [CONFIG_NAME, WEIGHTS_NAME]
+        for modeling_file in modeling_files:
+            if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
+                shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))
+        # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure.
+        if self.tokenizer is not None:
+            self.tokenizer.save_pretrained(output_dir)
+        # Same for the training arguments
+        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+        try:
+            if self.args.hub_strategy == HubStrategy.CHECKPOINT:
+                # Temporarily move the checkpoint just saved for the push
+                tmp_checkpoint = os.path.join(output_dir, "last-checkpoint")
+                # We have to remove the "last-checkpoint" dir if it exists, otherwise the checkpoint is moved as a
+                # subfolder.
+                if os.path.isdir(tmp_checkpoint):
+                    shutil.rmtree(tmp_checkpoint)
+                shutil.move(checkpoint_folder, tmp_checkpoint)
+
+            if self.args.save_strategy == IntervalStrategy.STEPS:
+                commit_message = f"Training in progress, step {self.state.global_step}"
+            else:
+                commit_message = f"Training in progress, epoch {int(self.state.epoch)}"
+            _, self.push_in_progress = self.repo.push_to_hub(commit_message=commit_message, blocking=False)
+        finally:
+            if self.args.hub_strategy == HubStrategy.CHECKPOINT:
+                # Move back the checkpoint to its place
+                shutil.move(tmp_checkpoint, checkpoint_folder)
+
+    def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str:
         """
         Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`.
 
         Parameters:
-            commit_message (:obj:`str`, `optional`, defaults to :obj:`"add model"`):
+            commit_message (:obj:`str`, `optional`, defaults to :obj:`"End of training"`):
                 Message to commit while pushing.
+            blocking (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                Whether the function should return only when the :obj:`git push` has finished.
             kwargs:
                 Additional keyword arguments passed along to :meth:`~transformers.Trainer.create_model_card`.
 
         Returns:
-            The url of the commit of your model in the given repository.
+            The url of the commit of your model in the given repository if :obj:`blocking=False`, a tuple with the url
+            of the commit and an object to track the progress of the commit if :obj:`blocking=True`
         """
 
         if self.args.should_save:
@@ -2553,7 +2634,7 @@ class Trainer:
         if not self.is_world_process_zero():
             return
 
-        return self.repo.push_to_hub(commit_message=commit_message)
+        return self.repo.push_to_hub(commit_message=commit_message, blocking=blocking)
 
     #
     # Deprecated code
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index d26217de7a..ea4a8739d8 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -125,6 +125,13 @@ class EvaluationStrategy(ExplicitEnum):
     EPOCH = "epoch"
 
 
+class HubStrategy(ExplicitEnum):
+    END = "end"
+    EVERY_SAVE = "every_save"
+    CHECKPOINT = "checkpoint"
+    ALL_CHECKPOINTS = "all_checkpoints"
+
+
 class BestRun(NamedTuple):
     """
     The best run found by an hyperparameter search (see :class:`~transformers.Trainer.hyperparameter_search`).
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 9829272978..e2bdccd609 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -32,7 +32,7 @@ from .file_utils import (
     is_torch_tpu_available,
     torch_required,
 )
-from .trainer_utils import EvaluationStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption
+from .trainer_utils import EvaluationStrategy, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption
 from .utils import logging
 
 
@@ -343,6 +343,22 @@ class TrainingArguments:
 
             Will default to :obj:`user_name/output_dir_name` with `output_dir_name` being the name of
             :obj:`output_dir`.
+        hub_strategy (:obj:`str` or :class:`~transformers.trainer_utils.HubStrategy`, `optional`, defaults to :obj:`"every_save"`):
+            Defines the scope of what is pushed to the Hub and when. Possible values are:
+
+            - :obj:`"end"`: push the model, its configuration, the tokenizer (if passed along to the
+              :class:`~transformers.Trainer`) and a draft of a model card at the end of training.
+            - :obj:`"every_save"`: push the model, its configuration, the tokenizer (if passed along to the
+              :class:`~transformers.Trainer`) and a draft of a model card each time there is a model save. The pushes
+              are asynchronous to not block training, and in case the save are very frequent, a new push is only
+              attempted if the previous one is finished. A last push is made with the final model at the end of
+              training.
+            - :obj:`"checkpoint"`: like :obj:`"every_save"` but the latest checkpoint is also pushed in a subfolder
+              named last-checkpoint, allowing you to resume training easily with
+              :obj:`trainer.train(resume_from_checkpoint="last-checkpoint")`.
+            - :obj:`"all_checkpoints"`: like :obj:`"checkpoint"` but all checkpoints are pushed like they appear in the
+              output folder (so you will get one checkpoint folder per folder in your final repository)
+
         hub_token (:obj:`str`, `optional`):
             The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
             :obj:`huggingface-cli login`.
@@ -618,6 +634,10 @@ class TrainingArguments:
     hub_model_id: str = field(
         default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
     )
+    hub_strategy: HubStrategy = field(
+        default="every_save",
+        metadata={"help": "The hub strategy to use when `--push_to_hub` is activated."},
+    )
     hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
     # Deprecated arguments
     push_to_hub_model_id: str = field(
@@ -668,6 +688,7 @@ class TrainingArguments:
         self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy)
         self.logging_strategy = IntervalStrategy(self.logging_strategy)
         self.save_strategy = IntervalStrategy(self.save_strategy)
+        self.hub_strategy = HubStrategy(self.hub_strategy)
 
         self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type)
         if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO:
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 04abf1d6ce..e47ff2dae0 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -18,13 +18,14 @@ import gc
 import os
 import random
 import re
+import subprocess
 import tempfile
 import unittest
 from pathlib import Path
 
 import numpy as np
 
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, Repository
 from requests.exceptions import HTTPError
 from transformers import (
     AutoTokenizer,
@@ -1284,10 +1285,11 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
 
     @classmethod
     def tearDownClass(cls):
-        try:
-            cls._api.delete_repo(token=cls._token, name="test-trainer")
-        except HTTPError:
-            pass
+        for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step"]:
+            try:
+                cls._api.delete_repo(token=cls._token, name=model)
+            except HTTPError:
+                pass
 
         try:
             cls._api.delete_repo(token=cls._token, name="test-trainer-org", organization="valid_org")
@@ -1336,6 +1338,55 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
             self.assertEqual(model.a.item(), trainer.model.a.item())
             self.assertEqual(model.b.item(), trainer.model.b.item())
 
+    def get_commit_history(self, repo):
+        commit_logs = subprocess.run(
+            "git log".split(),
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            check=True,
+            encoding="utf-8",
+            cwd=repo,
+        ).stdout
+        commits = commit_logs.split("\n\n")[1::2]
+        return [commit.strip() for commit in commits]
+
+    def test_push_to_hub_with_saves_each_epoch(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
+                push_to_hub=True,
+                hub_token=self._token,
+                save_strategy="epoch",
+            )
+            trainer.train()
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            _ = Repository(tmp_dir, clone_from=f"{USER}/test-trainer-epoch", use_auth_token=self._token)
+            commits = self.get_commit_history(tmp_dir)
+            expected_commits = [f"Training in progress, epoch {i}" for i in range(3, 0, -1)]
+            expected_commits.append("initial commit")
+            self.assertListEqual(commits, expected_commits)
+            print(commits, len(commits))
+
+    def test_push_to_hub_with_saves_each_n_steps(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=os.path.join(tmp_dir, "test-trainer-step"),
+                push_to_hub=True,
+                hub_token=self._token,
+                save_strategy="steps",
+                save_steps=5,
+            )
+            trainer.train()
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            _ = Repository(tmp_dir, clone_from=f"{USER}/test-trainer-step", use_auth_token=self._token)
+            commits = self.get_commit_history(tmp_dir)
+            expected_commits = [f"Training in progress, step {i}" for i in range(20, 0, -5)]
+            expected_commits.append("initial commit")
+            self.assertListEqual(commits, expected_commits)
+            print(commits, len(commits))
+
 
 @require_torch
 @require_optuna