From a59bcefbb1cf834353bd1177f32edfbc95dd4279 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 31 Aug 2020 15:16:39 -0400 Subject: [PATCH] Split hp search methods (#6857) * Split the run_hp_search by backend * Unused import --- src/transformers/integrations.py | 148 ++++++++++++++++--------------- src/transformers/trainer.py | 6 +- 2 files changed, 82 insertions(+), 72 deletions(-) diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 2382d50241..d1299282a0 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -3,7 +3,7 @@ import os import numpy as np -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, HPSearchBackend +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun from transformers.utils import logging @@ -83,7 +83,7 @@ def default_hp_search_backend(): return "ray" -def run_hp_search(trainer, n_trials, direction, kwargs): +def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: def _objective(trial, checkpoint_dir=None): model_path = None if checkpoint_dir: @@ -96,80 +96,88 @@ def run_hp_search(trainer, n_trials, direction, kwargs): if getattr(trainer, "objective", None) is None: metrics = trainer.evaluate() trainer.objective = trainer.compute_objective(metrics) - if trainer.hp_search_backend == HPSearchBackend.RAY: - trainer._tune_save_checkpoint() - ray.tune.report(objective=trainer.objective) return trainer.objective - if trainer.hp_search_backend == HPSearchBackend.OPTUNA: - timeout = kwargs.pop("timeout", None) - n_jobs = kwargs.pop("n_jobs", 1) - study = optuna.create_study(direction=direction, **kwargs) - study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) - best_trial = study.best_trial - best_run = BestRun(str(best_trial.number), best_trial.value, best_trial.params) - elif trainer.hp_search_backend == HPSearchBackend.RAY: - # The model and TensorBoard writer do not pickle so we have to remove them (if they exists) - # while doing the ray hp search. - _tb_writer = trainer.tb_writer - trainer.tb_writer = None - trainer.model = None - # Setup default `resources_per_trial` and `reporter`. - if "resources_per_trial" not in kwargs and trainer.args.n_gpu > 0: - # `args.n_gpu` is considered the total number of GPUs that will be split - # among the `n_jobs` - n_jobs = int(kwargs.pop("n_jobs", 1)) - num_gpus_per_trial = trainer.args.n_gpu - if num_gpus_per_trial / n_jobs >= 1: - num_gpus_per_trial = int(np.ceil(num_gpus_per_trial / n_jobs)) - kwargs["resources_per_trial"] = {"gpu": num_gpus_per_trial} + timeout = kwargs.pop("timeout", None) + n_jobs = kwargs.pop("n_jobs", 1) + study = optuna.create_study(direction=direction, **kwargs) + study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) + best_trial = study.best_trial + return BestRun(str(best_trial.number), best_trial.value, best_trial.params) - if "reporter" not in kwargs: - from ray.tune import CLIReporter - kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"]) - if "keep_checkpoints_num" in kwargs and kwargs["keep_checkpoints_num"] > 0: - # `keep_checkpoints_num=0` would disabled checkpointing - trainer.use_tune_checkpoints = True - if kwargs["keep_checkpoints_num"] > 1: +def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: + def _objective(trial, checkpoint_dir=None): + model_path = None + if checkpoint_dir: + for subdir in os.listdir(checkpoint_dir): + if subdir.startswith(PREFIX_CHECKPOINT_DIR): + model_path = os.path.join(checkpoint_dir, subdir) + trainer.objective = None + trainer.train(model_path=model_path, trial=trial) + # If there hasn't been any evaluation during the training loop. + if getattr(trainer, "objective", None) is None: + metrics = trainer.evaluate() + trainer.objective = trainer.compute_objective(metrics) + trainer._tune_save_checkpoint() + ray.tune.report(objective=trainer.objective) + return trainer.objective + + # The model and TensorBoard writer do not pickle so we have to remove them (if they exists) + # while doing the ray hp search. + _tb_writer = trainer.tb_writer + trainer.tb_writer = None + trainer.model = None + # Setup default `resources_per_trial` and `reporter`. + if "resources_per_trial" not in kwargs and trainer.args.n_gpu > 0: + # `args.n_gpu` is considered the total number of GPUs that will be split + # among the `n_jobs` + n_jobs = int(kwargs.pop("n_jobs", 1)) + num_gpus_per_trial = trainer.args.n_gpu + if num_gpus_per_trial / n_jobs >= 1: + num_gpus_per_trial = int(np.ceil(num_gpus_per_trial / n_jobs)) + kwargs["resources_per_trial"] = {"gpu": num_gpus_per_trial} + + if "reporter" not in kwargs: + from ray.tune import CLIReporter + + kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"]) + if "keep_checkpoints_num" in kwargs and kwargs["keep_checkpoints_num"] > 0: + # `keep_checkpoints_num=0` would disabled checkpointing + trainer.use_tune_checkpoints = True + if kwargs["keep_checkpoints_num"] > 1: + logger.warning( + "Currently keeping {} checkpoints for each trial. Checkpoints are usually huge, " + "consider setting `keep_checkpoints_num=1`." + ) + if "scheduler" in kwargs: + from ray.tune.schedulers import ASHAScheduler, HyperBandForBOHB, MedianStoppingRule, PopulationBasedTraining + + # Check if checkpointing is enabled for PopulationBasedTraining + if isinstance(kwargs["scheduler"], PopulationBasedTraining): + if not trainer.use_tune_checkpoints: logger.warning( - "Currently keeping {} checkpoints for each trial. Checkpoints are usually huge, " - "consider setting `keep_checkpoints_num=1`." + "You are using PopulationBasedTraining but you haven't enabled checkpointing. " + "This means your trials will train from scratch everytime they are exploiting " + "new configurations. Consider enabling checkpointing by passing " + "`keep_checkpoints_num=1` as an additional argument to `Trainer.hyperparameter_search`." ) - if "scheduler" in kwargs: - from ray.tune.schedulers import ( - ASHAScheduler, - HyperBandForBOHB, - MedianStoppingRule, - PopulationBasedTraining, + + # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting. + if isinstance( + kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining) + ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training): + raise RuntimeError( + "You are using {cls} as a scheduler but you haven't enabled evaluation during training. " + "This means your trials will not report intermediate results to Ray Tune, and " + "can thus not be stopped early or used to exploit other trials parameters. " + "If this is what you want, do not use {cls}. If you would like to use {cls}, " + "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the " + "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__) ) - # Check if checkpointing is enabled for PopulationBasedTraining - if isinstance(kwargs["scheduler"], PopulationBasedTraining): - if not trainer.use_tune_checkpoints: - logger.warning( - "You are using PopulationBasedTraining but you haven't enabled checkpointing. " - "This means your trials will train from scratch everytime they are exploiting " - "new configurations. Consider enabling checkpointing by passing " - "`keep_checkpoints_num=1` as an additional argument to `Trainer.hyperparameter_search`." - ) - - # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting. - if isinstance( - kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining) - ) and (not trainer.args.do_eval or not trainer.args.evaluate_during_training): - raise RuntimeError( - "You are using {cls} as a scheduler but you haven't enabled evaluation during training. " - "This means your trials will not report intermediate results to Ray Tune, and " - "can thus not be stopped early or used to exploit other trials parameters. " - "If this is what you want, do not use {cls}. If you would like to use {cls}, " - "make sure you pass `do_eval=True` and `evaluate_during_training=True` in the " - "Trainer `args`.".format(cls=type(kwargs["scheduler"]).__name__) - ) - - analysis = ray.tune.run(_objective, config=trainer.hp_space(None), num_samples=n_trials, **kwargs) - best_trial = analysis.get_best_trial(metric="objective", mode=direction[:3]) - best_run = BestRun(best_trial.trial_id, best_trial.last_result["objective"], best_trial.config) - trainer.tb_writer = _tb_writer - + analysis = ray.tune.run(_objective, config=trainer.hp_space(None), num_samples=n_trials, **kwargs) + best_trial = analysis.get_best_trial(metric="objective", mode=direction[:3]) + best_run = BestRun(best_trial.trial_id, best_trial.last_result["objective"], best_trial.config) + trainer.tb_writer = _tb_writer return best_run diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 428259bb48..2a6d757154 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -27,7 +27,8 @@ from .integrations import ( is_ray_available, is_tensorboard_available, is_wandb_available, - run_hp_search, + run_hp_search_optuna, + run_hp_search_ray, ) from .modeling_utils import PreTrainedModel from .optimization import AdamW, get_linear_schedule_with_warmup @@ -884,7 +885,8 @@ class Trainer: self.hp_space = default_hp_space[backend] if hp_space is None else hp_space self.compute_objective = default_compute_objective if compute_objective is None else compute_objective - best_run = run_hp_search(self, n_trials, direction, kwargs) + run_hp_search = run_hp_search_optuna if backend == HPSearchBackend.OPTUNA else run_hp_search_ray + best_run = run_hp_search(self, n_trials, direction, **kwargs) self.hp_search_backend = None return best_run