From 3a7fdd3f5214d1ec494379e7c65b4eb08146ddb0 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 24 Aug 2020 11:48:45 -0400 Subject: [PATCH] Add hyperparameter search to Trainer (#6576) * Add optuna hyperparameter search to Trainer * @julien-c suggestions Co-authored-by: Julien Chaumond * Make compute_objective an arg function * Formatting * Rework to make it easier to add ray * Formatting * Initial support for Ray * Formatting * Polish and finalize * Add trial id to checkpoint with Ray * Smaller default * Use GPU in ray if available * Formatting * Fix test * Update install instruction Co-authored-by: Richard Liaw * Address review comments * Formatting post-merge Co-authored-by: Julien Chaumond Co-authored-by: Richard Liaw --- src/transformers/__init__.py | 8 +- src/transformers/integrations.py | 29 ++++ src/transformers/trainer.py | 228 +++++++++++++++++++++++++++--- src/transformers/trainer_utils.py | 71 +++++++++- src/transformers/training_args.py | 10 ++ 5 files changed, 325 insertions(+), 21 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 833f5c46a4..5c001403b2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -92,7 +92,13 @@ from .file_utils import ( from .hf_argparser import HfArgumentParser # Integrations -from .integrations import is_comet_available, is_tensorboard_available, is_wandb_available +from .integrations import ( + is_comet_available, + is_optuna_available, + is_ray_available, + is_tensorboard_available, + is_wandb_available, +) # Model Cards from .modelcard import ModelCard diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index c200d4e1df..3e597b029e 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -35,6 +35,20 @@ except ImportError: except ImportError: _has_tensorboard = False +try: + import optuna # noqa: F401 + + _has_optuna = True +except (ImportError): + _has_optuna = False + +try: + import ray # noqa: F401 + + _has_ray = True +except (ImportError): + _has_ray = False + def is_wandb_available(): return _has_wandb @@ -46,3 +60,18 @@ def is_comet_available(): def is_tensorboard_available(): return _has_tensorboard + + +def is_optuna_available(): + return _has_optuna + + +def is_ray_available(): + return _has_ray + + +def default_hp_search_backend(): + if is_optuna_available(): + return "optuna" + elif is_ray_available(): + return "ray" diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4d5b75380f..97072e689e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -21,10 +21,27 @@ from tqdm.auto import tqdm, trange from .data.data_collator import DataCollator, default_data_collator from .file_utils import is_nlp_available, is_torch_tpu_available -from .integrations import is_comet_available, is_tensorboard_available, is_wandb_available +from .integrations import ( + default_hp_search_backend, + is_comet_available, + is_optuna_available, + is_ray_available, + is_tensorboard_available, + is_wandb_available, +) from .modeling_utils import PreTrainedModel from .optimization import AdamW, get_linear_schedule_with_warmup -from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput, set_seed +from .trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + BestRun, + EvalPrediction, + HPSearchBackend, + PredictionOutput, + TrainOutput, + default_compute_objective, + default_hp_space, + set_seed, +) from .training_args import TrainingArguments @@ -62,6 +79,12 @@ if is_wandb_available(): if is_comet_available(): import comet_ml +if is_optuna_available(): + import optuna + +if is_ray_available(): + from ray import tune + logger = logging.getLogger(__name__) @@ -140,10 +163,11 @@ class Trainer: optimized for 🤗 Transformers. Args: - model (:class:`~transformers.PreTrainedModel`): - The model to train, evaluate or use for predictions. - args (:class:`~transformers.TrainingArguments`): - The arguments to tweak for training. + model (:class:`~transformers.PreTrainedModel`, `optional`): + The model to train, evaluate or use for predictions. If not provided, a ``model_init`` must be passed. + args (:class:`~transformers.TrainingArguments`, `optional`): + The arguments to tweak for training. Will default to a basic instance of :class:`~transformers.TrainingArguments` + with the ``output_dir`` set to a directory named `tmp_trainer` in the current directory if not provided. data_collator (:obj:`DataCollator`, `optional`, defaults to :func:`~transformers.default_data_collator`): The function to use to form a batch from a list of elements of :obj:`train_dataset` or :obj:`eval_dataset`. @@ -151,8 +175,11 @@ class Trainer: The dataset to use for training. If it is an :obj:`nlp.Dataset`, columns not accepted by the ``model.forward()`` method are automatically removed. eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`): - The dataset to use for evaluation. If it is an :obj:`nlp.Dataset`, columns not accepted by the + The dataset to use for evaluation. If it is an :obj:`nlp.Dataset`, columns not accepted by the ``model.forward()`` method are automatically removed. + model_init (:obj:`Callable[[], PreTrainedModel]`, `optional`): + A function that instantiates the model to be used. If provided, each call to + :meth:`~transformers.Trainer.train` will start from a new instance of the model as given by this function. compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`): The function that will be used to compute metrics at evaluation. Must take a :class:`~transformers.EvalPrediction` and return a dictionary string to metric values. @@ -168,21 +195,31 @@ class Trainer: def __init__( self, - model: PreTrainedModel, - args: TrainingArguments, + model: PreTrainedModel = None, + args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Dataset] = None, + model_init: Callable[[], PreTrainedModel] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, tb_writer: Optional["SummaryWriter"] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), **kwargs, ): - self.model = model.to(args.device) + assert ( + model is not None or model_init is not None + ), "You must provide a model to use `Trainer`, either by using the `model` argument or the `model_init` argument." + if model is None and model_init is not None: + model = model_init() + self.model = model.to(args.device) if model is not None else None + if args is None: + logger.info("No `TrainingArguments` passed, using the current path as `output_dir`.") + args = TrainingArguments("tmp_trainer") self.args = args self.data_collator = data_collator if data_collator is not None else default_data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset + self.model_init = model_init self.compute_metrics = compute_metrics self.optimizer, self.lr_scheduler = optimizers self.tb_writer = tb_writer @@ -242,6 +279,7 @@ class Trainer: self.epoch = None if self.args.fp16 and _use_native_amp: self.scaler = torch.cuda.amp.GradScaler() + self.hp_search_backend = None def _remove_unused_columns(self, dataset: "nlp.Dataset", description: Optional[str] = None): if not self.args.remove_unused_columns: @@ -462,7 +500,38 @@ class Trainer: """ return len(dataloader.dataset) - def train(self, model_path: Optional[str] = None): + def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): + """ HP search setup code """ + if self.hp_search_backend is None or trial is None: + return + params = self.hp_space(trial) if self.hp_search_backend == HPSearchBackend.OPTUNA else trial + for key, value in params.items(): + if not hasattr(self.args, key): + raise AttributeError( + f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`." + ) + old_attr = getattr(self.args, key, None) + # Casting value to the proper type + if old_attr is not None: + value = type(old_attr)(value) + setattr(self.args, key, value) + if self.hp_search_backend == HPSearchBackend.OPTUNA: + logger.info("Trial:", trial.params) + + def _report_to_hp_search( + self, trial: Union["optuna.Trial", Dict[str, Any]], epoch: int, metrics: Dict[str, float] + ): + if self.hp_search_backend is None or trial is None: + return + self.objective = self.compute_objective(metrics) + if self.hp_search_backend == HPSearchBackend.OPTUNA: + trial.report(self.objective, epoch) + if trial.should_prune(): + raise optuna.TrialPruned() + elif self.hp_search_backend == HPSearchBackend.RAY: + tune.report(objective=self.objective, **metrics) + + def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None): """ Main training entry point. @@ -470,7 +539,17 @@ class Trainer: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. + trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`): + The trial run or the hyperparameter dictionary for hyperparameter search. """ + # Model re-init + if self.model_init is not None: + model = self.model_init() + self.model = model.to(self.args.device) + + self._hp_search_setup(trial) + + # Data loader and number of training steps train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps @@ -561,9 +640,8 @@ class Trainer: tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() - train_iterator = trange( - epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=not self.is_local_process_zero() - ) + disable_tqdm = self.args.disable_tqdm or not self.is_local_process_zero() + train_iterator = trange(epochs_trained, int(np.ceil(num_train_epochs)), desc="Epoch", disable=disable_tqdm) for epoch in train_iterator: if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) @@ -572,9 +650,9 @@ class Trainer: parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) - epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_process_zero()) + epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=disable_tqdm) else: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_process_zero()) + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=disable_tqdm) # Reset the past mems state at the beginning of each epoch if necessary. if self.args.past_index >= 0: @@ -631,7 +709,8 @@ class Trainer: self.log(logs) if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0: - self.evaluate() + metrics = self.evaluate() + self._report_to_hp_search(trial, epoch, metrics) if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0: # In all cases (even distributed/parallel), self.model is always a reference @@ -643,7 +722,15 @@ class Trainer: else: assert model is self.model, f"Model {model} should be a reference to self.model" # Save model checkpoint - output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}") + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}" + if self.hp_search_backend is not None and trial is not None: + run_id = ( + trial.number + if self.hp_search_backend == HPSearchBackend.OPTUNA + else tune.get_trial_id() + ) + checkpoint_folder += f"-run-{run_id}" + output_dir = os.path.join(self.args.output_dir, checkpoint_folder) self.save_model(output_dir) @@ -683,6 +770,108 @@ class Trainer: logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return TrainOutput(self.global_step, tr_loss / self.global_step) + def hyperparameter_search( + self, + hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None, + compute_objective: Optional[Callable[[Dict[str, float]], float]] = None, + n_trials: int = 20, + timeout: int = 1800, + n_jobs: int = 1, + direction: str = "minimize", + backend: Optional[Union["str", HPSearchBackend]] = None, + **kwargs + ) -> BestRun: + """ + Launch an hyperparameter search using ``optuna`` or ``Ray Tune``. The optimized quantity is determined by the + method, which is the evaluation loss when no metric is provided, the sum of all metrics otherwise (you can + change that behavior by subclassing and overriding this method). + + Args: + hp_space (:obj:`Callable[["optuna.Trial"], Dict[str, float]]`, `optional`): + A function that defines the hyperparameter search space. Will default to + :func:`~transformers.trainer_utils.default_hp_space_optuna` or + :func:`~transformers.trainer_utils.default_hp_space_ray` depending on your backend. + compute_objective (:obj:`Callable[[Dict[str, float]], float]`, `optional`): + A function computing the objective to minimize or maximize from the metrics returned by the + :obj:`evaluate` method. Will default to :func:`~transformers.trainer_utils.default_compute_objective`. + n_trials (:obj:`int`, `optional`, defaults to 100): + The number of trial runs to test. + direction(:obj:`str`, `optional`, defaults to :obj:`"minimize"`): + Whether to optimize greater or lower objects. Can be :obj:`"minimize"` or :obj:`"maximize"`, you should + pick :obj:`"minimize"` when optimizing the validation loss, :obj:`"maximize"` when optimizing one or + several metrics. + backend(:obj:`str` or :class:`~transformers.training_utils.HPSearchBackend`, `optional`): + The backend to use for hyperparameter search. Will default to optuna or Ray Tune, depending on which + one is installed. If both are installed, will default to optuna. + kwargs: + Additional keyword arguments passed along to :obj:`optuna.create_study` or :obj:`ray.tune.run`. For + more information see: + + - the documentation of `optuna.create_stufy `__ + - the documentation of `tune.run `__ + + Returns: + :class:`transformers.trainer_utils.BestRun`: All the informations about the best run. + """ + if backend is None: + backend = default_hp_search_backend() + if backend is None: + raise RuntimeError( + "At least one of optuna or ray should be installed. " + "To install optuna run `pip install optuna`." + "To install ray run `pip install ray[tune]`." + ) + backend = HPSearchBackend(backend) + if backend == HPSearchBackend.OPTUNA and not is_optuna_available(): + raise RuntimeError(" You picked the optuna backend, but it is not installed. Use `pip install optuna`.") + if backend == HPSearchBackend.RAY and not is_ray_available(): + raise RuntimeError( + " You picked the Ray Tune backend, but it is not installed. Use `pip install 'ray[tune]'`." + ) + self.hp_search_backend = backend + + self.hp_space = default_hp_space[backend] if hp_space is None else hp_space + self.compute_objective = default_compute_objective if compute_objective is None else compute_objective + + def _objective(trial): + # To make sure optimizer and lr_scheduler are reset with the new choices of HPs + self.optimizer = None + self.lr_scheduler = None + self.objective = None + self.train(trial=trial) + # If there hasn't been any evaluation during the training loop. + if getattr(self, "objective", None) is None: + metrics = self.evaluate() + self.objective = self.compute_objective(metrics) + return self.objective + + if self.hp_search_backend == HPSearchBackend.OPTUNA: + timeout = kwargs.pop("timeout", None) + n_jobs = kwargs.pop("n_jobs", 1) + study = optuna.create_study(direction=direction, **kwargs) + study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) + best_trial = study.best_trial + best_run = BestRun(str(best_trial.number), best_trial.value, best_trial.params) + elif self.hp_search_backend == HPSearchBackend.RAY: + # The TensorBoard writer does not pickle so we have to remove it (if it exists) while doing the ray hp + # search. + _tb_writer = self.tb_writer + self.tb_writer = None + # Setup default `resources_per_trial` and `reporter`. + if "resources_per_trial" not in kwargs and self.args.n_gpu > 0: + kwargs["resources_per_trial"] = {"gpu": self.args.n_gpu} + if "reporter" not in kwargs: + from ray.tune import CLIReporter + + kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"]) + analysis = tune.run(_objective, config=self.hp_space(None), num_samples=n_trials, **kwargs) + best_trial = analysis.get_best_trial(metric="objective", mode=direction[:3]) + best_run = BestRun(best_trial.trial_id, best_trial.last_result["objective"], best_trial.config) + self.tb_writer = _tb_writer + + self.hp_search_backend = None + return best_run + def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None: """ Log :obj:`logs` on the various objects watching training. @@ -1020,8 +1209,9 @@ class Trainer: if self.args.past_index >= 0: self._past = None + disable_tqdm = not self.is_local_process_zero() or self.args.disable_tqdm samples_count = 0 - for inputs in tqdm(dataloader, desc=description): + for inputs in tqdm(dataloader, desc=description, disable=disable_tqdm): loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only) batch_size = inputs[list(inputs.keys())[0]].shape[0] samples_count += batch_size diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 5bfdddb071..d5556f16c3 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -1,9 +1,11 @@ import random -from typing import Dict, NamedTuple, Optional +from typing import Any, Dict, NamedTuple, Optional import numpy as np from .file_utils import is_tf_available, is_torch_available +from .integrations import is_ray_available +from .tokenization_utils_base import ExplicitEnum def set_seed(seed: int): @@ -53,3 +55,70 @@ class TrainOutput(NamedTuple): PREFIX_CHECKPOINT_DIR = "checkpoint" + + +class BestRun(NamedTuple): + """ + The best run found by an hyperparameter search (see :class:`~transformers.Trainer.hyperparameter_search`). + + Parameters: + run_id (:obj:`str`): + The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending + with run-{run_id}). + objective (:obj:`float`): + The objective that was obtained for this run. + hyperparameters (:obj:`Dict[str, Any]`): + The hyperparameters picked to get this run. + """ + + run_id: str + objective: float + hyperparameters: Dict[str, Any] + + +def default_compute_objective(metrics: Dict[str, float]) -> float: + """ + The default objective to maximize/minimize when doing an hyperparameter search. It is the evaluation loss if no + metrics are provided to the :class:`~transformers.Trainer`, the sum of all metrics otherwise. + + Args: + metrics (:obj:`Dict[str, float]`): The metrics returned by the evaluate method. + + Return: + :obj:`float`: The objective to minimize or maximize + """ + loss = metrics.pop("eval_loss", None) + _ = metrics.pop("epoch", None) + return loss if len(metrics) == 0 else sum(metrics.values()) + + +def default_hp_space_optuna(trial) -> Dict[str, float]: + return { + "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), + "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5), + "seed": trial.suggest_int("seed", 1, 40), + "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]), + } + + +def default_hp_space_ray(trial) -> Dict[str, float]: + assert is_ray_available(), "This function needs ray installed: `pip install ray[tune]`" + from ray import tune + + return { + "learning_rate": tune.loguniform(1e-6, 1e-4), + "num_train_epochs": tune.choice(range(1, 6)), + "seed": tune.uniform(1, 40), + "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]), + } + + +class HPSearchBackend(ExplicitEnum): + OPTUNA = "optuna" + RAY = "ray" + + +default_hp_space = { + HPSearchBackend.OPTUNA: default_hp_space_optuna, + HPSearchBackend.RAY: default_hp_space_ray, +} diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 3c543414ff..6e3f22fad8 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -114,6 +114,9 @@ class TrainingArguments: at the next training step under the keyword argument ``mems``. run_name (:obj:`str`, `optional`): A descriptor for the run. Notably used for wandb logging. + disable_tqdm (:obj:`bool`, `optional`): + Whether or not to disable the tqdm progress bars. Will default to :obj:`True` if the logging level is set + to warn or lower (default), :obj:`False` otherwise. remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`): If using `nlp.Dataset` datasets, whether or not to automatically remove the columns unused by the model forward method. @@ -238,6 +241,13 @@ class TrainingArguments: run_name: Optional[str] = field( default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."} ) + disable_tqdm: Optional[bool] = field( + default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} + ) + + def __post_init__(self): + if self.disable_tqdm is None: + self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN remove_unused_columns: Optional[bool] = field( default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."}