From 3081d3868e8cefb2d851e713ac7e94cf5f449f28 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 14 Sep 2021 08:02:15 -0400 Subject: [PATCH] Push to hub when saving checkpoints (#13503) * Push to hub when saving checkpoints * Add model card * Revert partial model card * Small fix for checkpoint * Add tests * Add documentation * Fix tests * Bump huggingface_hub * Fix test --- docs/source/main_classes/trainer.rst | 23 +++ setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/trainer.py | 153 +++++++++++++----- src/transformers/trainer_utils.py | 7 + src/transformers/training_args.py | 23 ++- tests/test_trainer.py | 61 ++++++- 7 files changed, 227 insertions(+), 44 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 4c3a947743..9429136c49 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -119,6 +119,29 @@ TFTrainingArguments :members: +Checkpoints +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the +:class:`~transformers.TrainingArguments` you are using. Those will go in subfolder named :obj:`checkpoint-xxx` with xxx +being the step at which the training was at. + +Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either: + +- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint +- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory + passed. + +In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. 
By default, all +the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt +the :obj:`hub_strategy` value of your :class:`~transformers.TrainingArguments` to either: + +- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to + resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`. +- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one + checkpoint folder per folder in your final repository) + + Logging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/setup.py b/setup.py index 27095b2cca..c7763775a7 100644 --- a/setup.py +++ b/setup.py @@ -100,7 +100,7 @@ _deps = [ "flax>=0.3.4", "fugashi>=1.0", "GitPython<3.1.19", - "huggingface-hub>=0.0.12", + "huggingface-hub>=0.0.17", "importlib_metadata", "ipadic>=1.0.0,<2.0", "isort>=5.5.4", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index bf548c73e6..1e61b7acde 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -18,7 +18,7 @@ deps = { "flax": "flax>=0.3.4", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", - "huggingface-hub": "huggingface-hub>=0.0.12", + "huggingface-hub": "huggingface-hub>=0.0.17", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "isort": "isort>=5.5.4", diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 77471ea137..26431cd86d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -110,6 +110,8 @@ from .trainer_utils import ( EvalLoopOutput, EvalPrediction, HPSearchBackend, + HubStrategy, + IntervalStrategy, PredictionOutput, ShardedDDPOption, TrainerMemoryTracker, @@ -180,6 +182,14 @@ if 
TYPE_CHECKING: logger = logging.get_logger(__name__) +# Name of the files used for checkpointing +TRAINING_ARGS_NAME = "training_args.bin" +TRAINER_STATE_NAME = "trainer_state.json" +OPTIMIZER_NAME = "optimizer.pt" +SCHEDULER_NAME = "scheduler.pt" +SCALER_NAME = "scaler.pt" + + class Trainer: """ Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers. @@ -389,6 +399,12 @@ class Trainer: # Create clone of distant repo and output directory if needed if self.args.push_to_hub: self.init_git_repo() + # In case of pull, we need to make sure every process has the latest. + if is_torch_tpu_available(): + xm.rendezvous("init git repo") + elif args.local_rank != -1: + dist.barrier() + if self.args.should_save: os.makedirs(self.args.output_dir, exist_ok=True) @@ -901,9 +917,9 @@ class Trainer: output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}") self.save_model(output_dir) if self.args.should_save: - self.state.save_to_json(os.path.join(output_dir, "trainer_state.json")) - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) def call_model_init(self, trial=None): model_init_argcount = number_of_arguments(self.model_init) @@ -1183,9 +1199,9 @@ class Trainer: # Check if continuing training from a checkpoint if resume_from_checkpoint is not None and os.path.isfile( - os.path.join(resume_from_checkpoint, "trainer_state.json") + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) ): - self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, "trainer_state.json")) + self.state = 
TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) @@ -1520,9 +1536,9 @@ class Trainer: if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") - xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: - xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) elif is_sagemaker_mp_enabled(): if smp.dp_rank() == 0: @@ -1530,20 +1546,20 @@ class Trainer: opt_state_dict = self.optimizer.state_dict() # Save it and the scheduler on the main process if self.args.should_save: - torch.save(opt_state_dict, os.path.join(output_dir, "optimizer.pt")) + torch.save(opt_state_dict, os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) if self.use_amp: - torch.save(self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt")) + torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) elif self.args.should_save and not self.deepspeed: # deepspeed.save_checkpoint above saves model/optim/sched - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: - torch.save(self.lr_scheduler.state_dict(), 
os.path.join(output_dir, "scheduler.pt")) + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) reissue_pt_warnings(caught_warnings) if self.use_amp: - torch.save(self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt")) + torch.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) # Determine the new best metric / best model checkpoint if metrics is not None and self.args.metric_for_best_model is not None: @@ -1563,7 +1579,7 @@ class Trainer: # Save the Trainer state if self.args.should_save: - self.state.save_to_json(os.path.join(output_dir, "trainer_state.json")) + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) # Save RNG state in non-distributed training rng_states = { @@ -1590,6 +1606,9 @@ class Trainer: else: torch.save(rng_states, os.path.join(output_dir, f"rng_state_{local_rank}.pth")) + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) + # Maybe delete some older checkpoints. if self.args.should_save: self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) @@ -1603,15 +1622,15 @@ class Trainer: # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init return - if os.path.isfile(os.path.join(checkpoint, "optimizer.pt")) and os.path.isfile( - os.path.join(checkpoint, "scheduler.pt") + if os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) and os.path.isfile( + os.path.join(checkpoint, SCHEDULER_NAME) ): # Load in optimizer and scheduler states if is_torch_tpu_available(): # On TPU we have to take some extra precautions to properly load the states on the right device. 
- optimizer_state = torch.load(os.path.join(checkpoint, "optimizer.pt"), map_location="cpu") + optimizer_state = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu") with warnings.catch_warnings(record=True) as caught_warnings: - lr_scheduler_state = torch.load(os.path.join(checkpoint, "scheduler.pt"), map_location="cpu") + lr_scheduler_state = torch.load(os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu") reissue_pt_warnings(caught_warnings) xm.send_cpu_data_to_device(optimizer_state, self.args.device) @@ -1622,13 +1641,13 @@ class Trainer: else: map_location = "cpu" if is_sagemaker_mp_enabled() else self.args.device self.optimizer.load_state_dict( - torch.load(os.path.join(checkpoint, "optimizer.pt"), map_location=map_location) + torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) ) with warnings.catch_warnings(record=True) as caught_warnings: - self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, "scheduler.pt"))) + self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME))) reissue_pt_warnings(caught_warnings) - if self.use_amp and os.path.isfile(os.path.join(checkpoint, "scaler.pt")): - self.scaler.load_state_dict(torch.load(os.path.join(checkpoint, "scaler.pt"))) + if self.use_amp and os.path.isfile(os.path.join(checkpoint, SCALER_NAME)): + self.scaler.load_state_dict(torch.load(os.path.join(checkpoint, SCALER_NAME))) def hyperparameter_search( self, @@ -1908,7 +1927,7 @@ class Trainer: if xm.is_master_ordinal(): os.makedirs(output_dir, exist_ok=True) - torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) # Save a trained model and configuration using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` @@ -1953,7 +1972,7 @@ class Trainer: self.tokenizer.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model - torch.save(self.args, os.path.join(output_dir, "training_args.bin")) + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) def store_flos(self): # Storing the number of floating-point operations that went into the model @@ -2476,9 +2495,9 @@ class Trainer: def init_git_repo(self): """ - Initializes a git repo in :obj:`self.args.push_to_hub_model_id`. + Initializes a git repo in :obj:`self.args.hub_model_id`. """ - if not self.args.should_save: + if not self.is_world_process_zero(): return use_auth_token = True if self.args.hub_token is None else self.args.hub_token if self.args.hub_model_id is None: @@ -2486,17 +2505,36 @@ class Trainer: else: repo_name = self.args.hub_model_id - self.repo = Repository( - self.args.output_dir, - clone_from=repo_name, - use_auth_token=use_auth_token, - ) + try: + self.repo = Repository( + self.args.output_dir, + clone_from=repo_name, + use_auth_token=use_auth_token, + ) + except EnvironmentError: + if self.args.overwrite_output_dir: + # Try again after wiping output_dir + shutil.rmtree(self.args.output_dir) + self.repo = Repository( + self.args.output_dir, + clone_from=repo_name, + use_auth_token=use_auth_token, + ) + else: + raise + + self.repo.git_pull() # By default, ignore the checkpoint folders - if not os.path.exists(os.path.join(self.args.output_dir, ".gitignore")): + if ( + not os.path.exists(os.path.join(self.args.output_dir, ".gitignore")) + and self.args.hub_strategy != HubStrategy.ALL_CHECKPOINTS + ): with open(os.path.join(self.args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer: writer.writelines(["checkpoint-*/"]) + self.push_in_progress = None + def create_model_card( self, language: Optional[str] = None, @@ -2525,18 +2563,61 @@ class Trainer: with 
open(os.path.join(self.args.output_dir, "README.md"), "w") as f: f.write(model_card) - def push_to_hub(self, commit_message: Optional[str] = "add model", **kwargs) -> str: + def _push_from_checkpoint(self, checkpoint_folder): + # Only push from one node. + if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END: + return + # If we haven't finished the last push, we don't do this one. + if self.push_in_progress is not None and not self.push_in_progress.is_done: + return + + output_dir = self.args.output_dir + # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder + modeling_files = [CONFIG_NAME, WEIGHTS_NAME] + for modeling_file in modeling_files: + if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)): + shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file)) + # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + # Same for the training arguments + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + try: + if self.args.hub_strategy == HubStrategy.CHECKPOINT: + # Temporarily move the checkpoint just saved for the push + tmp_checkpoint = os.path.join(output_dir, "last-checkpoint") + # We have to remove the "last-checkpoint" dir if it exists, otherwise the checkpoint is moved as a + # subfolder. 
+ if os.path.isdir(tmp_checkpoint): + shutil.rmtree(tmp_checkpoint) + shutil.move(checkpoint_folder, tmp_checkpoint) + + if self.args.save_strategy == IntervalStrategy.STEPS: + commit_message = f"Training in progress, step {self.state.global_step}" + else: + commit_message = f"Training in progress, epoch {int(self.state.epoch)}" + _, self.push_in_progress = self.repo.push_to_hub(commit_message=commit_message, blocking=False) + finally: + if self.args.hub_strategy == HubStrategy.CHECKPOINT: + # Move back the checkpoint to its place + shutil.move(tmp_checkpoint, checkpoint_folder) + + def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str: """ Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`. Parameters: - commit_message (:obj:`str`, `optional`, defaults to :obj:`"add model"`): + commit_message (:obj:`str`, `optional`, defaults to :obj:`"End of training"`): Message to commit while pushing. + blocking (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the function should return only when the :obj:`git push` has finished. kwargs: Additional keyword arguments passed along to :meth:`~transformers.Trainer.create_model_card`. Returns: - The url of the commit of your model in the given repository. 
+ The url of the commit of your model in the given repository if :obj:`blocking=True`, a tuple with the url + of the commit and an object to track the progress of the commit if :obj:`blocking=False` """ if self.args.should_save: @@ -2553,7 +2634,7 @@ class Trainer: if not self.is_world_process_zero(): return - return self.repo.push_to_hub(commit_message=commit_message) + return self.repo.push_to_hub(commit_message=commit_message, blocking=blocking) # # Deprecated code diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index d26217de7a..ea4a8739d8 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -125,6 +125,13 @@ class EvaluationStrategy(ExplicitEnum): EPOCH = "epoch" +class HubStrategy(ExplicitEnum): + END = "end" + EVERY_SAVE = "every_save" + CHECKPOINT = "checkpoint" + ALL_CHECKPOINTS = "all_checkpoints" + + class BestRun(NamedTuple): """ The best run found by an hyperparameter search (see :class:`~transformers.Trainer.hyperparameter_search`). diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 9829272978..e2bdccd609 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -32,7 +32,7 @@ from .file_utils import ( is_torch_tpu_available, torch_required, ) -from .trainer_utils import EvaluationStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption +from .trainer_utils import EvaluationStrategy, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption from .utils import logging @@ -343,6 +343,22 @@ class TrainingArguments: Will default to :obj:`user_name/output_dir_name` with `output_dir_name` being the name of :obj:`output_dir`. + hub_strategy (:obj:`str` or :class:`~transformers.trainer_utils.HubStrategy`, `optional`, defaults to :obj:`"every_save"`): + Defines the scope of what is pushed to the Hub and when. 
Possible values are: + + - :obj:`"end"`: push the model, its configuration, the tokenizer (if passed along to the + :class:`~transformers.Trainer`) and a draft of a model card at the end of training. + - :obj:`"every_save"`: push the model, its configuration, the tokenizer (if passed along to the + :class:`~transformers.Trainer`) and a draft of a model card each time there is a model save. The pushes + are asynchronous to not block training, and in case the saves are very frequent, a new push is only + attempted if the previous one is finished. A last push is made with the final model at the end of + training. + - :obj:`"checkpoint"`: like :obj:`"every_save"` but the latest checkpoint is also pushed in a subfolder + named last-checkpoint, allowing you to resume training easily with + :obj:`trainer.train(resume_from_checkpoint="last-checkpoint")`. + - :obj:`"all_checkpoints"`: like :obj:`"checkpoint"` but all checkpoints are pushed like they appear in the + output folder (so you will get one checkpoint folder per folder in your final repository) + hub_token (:obj:`str`, `optional`): The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with :obj:`huggingface-cli login`. 
@@ -618,6 +634,10 @@ class TrainingArguments: hub_model_id: str = field( default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."} ) + hub_strategy: HubStrategy = field( + default="every_save", + metadata={"help": "The hub strategy to use when `--push_to_hub` is activated."}, + ) hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."}) # Deprecated arguments push_to_hub_model_id: str = field( @@ -668,6 +688,7 @@ class TrainingArguments: self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) self.logging_strategy = IntervalStrategy(self.logging_strategy) self.save_strategy = IntervalStrategy(self.save_strategy) + self.hub_strategy = HubStrategy(self.hub_strategy) self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 04abf1d6ce..e47ff2dae0 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -18,13 +18,14 @@ import gc import os import random import re +import subprocess import tempfile import unittest from pathlib import Path import numpy as np -from huggingface_hub import HfApi +from huggingface_hub import HfApi, Repository from requests.exceptions import HTTPError from transformers import ( AutoTokenizer, @@ -1284,10 +1285,11 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): @classmethod def tearDownClass(cls): - try: - cls._api.delete_repo(token=cls._token, name="test-trainer") - except HTTPError: - pass + for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step"]: + try: + cls._api.delete_repo(token=cls._token, name=model) + except HTTPError: + pass try: cls._api.delete_repo(token=cls._token, name="test-trainer-org", organization="valid_org") @@ -1336,6 +1338,55 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): self.assertEqual(model.a.item(), 
trainer.model.a.item()) self.assertEqual(model.b.item(), trainer.model.b.item()) + def get_commit_history(self, repo): + commit_logs = subprocess.run( + "git log".split(), + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + check=True, + encoding="utf-8", + cwd=repo, + ).stdout + commits = commit_logs.split("\n\n")[1::2] + return [commit.strip() for commit in commits] + + def test_push_to_hub_with_saves_each_epoch(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=os.path.join(tmp_dir, "test-trainer-epoch"), + push_to_hub=True, + hub_token=self._token, + save_strategy="epoch", + ) + trainer.train() + + with tempfile.TemporaryDirectory() as tmp_dir: + _ = Repository(tmp_dir, clone_from=f"{USER}/test-trainer-epoch", use_auth_token=self._token) + commits = self.get_commit_history(tmp_dir) + expected_commits = [f"Training in progress, epoch {i}" for i in range(3, 0, -1)] + expected_commits.append("initial commit") + self.assertListEqual(commits, expected_commits) + print(commits, len(commits)) + + def test_push_to_hub_with_saves_each_n_steps(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=os.path.join(tmp_dir, "test-trainer-step"), + push_to_hub=True, + hub_token=self._token, + save_strategy="steps", + save_steps=5, + ) + trainer.train() + + with tempfile.TemporaryDirectory() as tmp_dir: + _ = Repository(tmp_dir, clone_from=f"{USER}/test-trainer-step", use_auth_token=self._token) + commits = self.get_commit_history(tmp_dir) + expected_commits = [f"Training in progress, step {i}" for i in range(20, 0, -5)] + expected_commits.append("initial commit") + self.assertListEqual(commits, expected_commits) + print(commits, len(commits)) + @require_torch @require_optuna