From c74f3d4c480a6971e302de7cef226e9a157ef0d0 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Thu, 3 Feb 2022 00:36:14 +0530 Subject: [PATCH] Add W&B backend for hyperparameter sweep (#14582) # Add support for W&B hyperparameter sweep This PR: * allows using wandb for running hyperparameter search. * The runs are visualized on W&B sweeps dashboard * This supports runnning sweeps on parallel devices, all reporting to the same central dashboard. ### Usage **To run new a hyperparameter search:** ``` trainer.hyperparameter_search( backend="wandb", project="transformers_sweep", # name of the project n_trials=5, metric="eval/loss", # metric to be optimized, default 'eval/loss'. A warning is raised if the passed metric is not found ) ``` This outputs a sweep id. Eg. `my_project/sweep_id` **To run sweeps on parallel devices:** Just pass sweep id which you want to run parallel ``` trainer.hyperparameter_search( backend="wandb", sweep_id = "my_project/sweep_id" ) ``` --- .github/workflows/self-scheduled.yml | 2 + src/transformers/integrations.py | 74 ++++++++++++++++++++++++++++ src/transformers/testing_utils.py | 15 +++++- src/transformers/trainer.py | 13 +++++ src/transformers/trainer_utils.py | 20 ++++++++ tests/test_trainer.py | 57 +++++++++++++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 973c0b9a4b..1f11e7a522 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -38,6 +38,7 @@ jobs: pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] pip install https://github.com/kpu/kenlm/archive/master.zip python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' + wandb login ${{ secrets.WANDB_API_KEY }} - name: Are GPUs recognized by our DL frameworks run: | @@ -271,6 +272,7 @@ jobs: pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] pip install https://github.com/kpu/kenlm/archive/master.zip python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' + wandb login ${{ secrets.WANDB_API_KEY }} - name: Are GPUs recognized by our DL frameworks run: | diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index a18d06fe41..e072de4c51 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -125,6 +125,10 @@ def hp_params(trial): if isinstance(trial, dict): return trial + if is_wandb_available(): + if isinstance(trial, dict): + return trial + raise RuntimeError(f"Unknown type for trial {trial.__class__}") @@ -337,6 +341,75 @@ def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> Be return best_run +def run_hp_search_wandb(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: + from .integrations import is_wandb_available + + if not is_wandb_available(): + raise ImportError("This function needs wandb installed: `pip install wandb`") + import wandb + + # add WandbCallback if not already added in trainer callbacks + reporting_to_wandb = False + for callback in trainer.callback_handler.callbacks: + if isinstance(callback, WandbCallback): + reporting_to_wandb = True + break + if not reporting_to_wandb: + trainer.add_callback(WandbCallback()) + trainer.args.report_to = "wandb" + best_trial = {"run_id": None, "objective": None, "hyperparameters": None} + sweep_id = kwargs.pop("sweep_id", None) + project = kwargs.pop("project", None) + name = kwargs.pop("name", None) + entity = kwargs.pop("entity", None) + metric = kwargs.pop("metric", "eval/loss") + + sweep_config = trainer.hp_space(None) + sweep_config["metric"]["goal"] = direction + sweep_config["metric"]["name"] = metric + if name: + sweep_config["name"] = name + + def _objective(): + + run = wandb.run if wandb.run else wandb.init() + trainer.state.trial_name = run.name + run.config.update({"assignments": {}, "metric": metric}) + config = wandb.config + + trainer.objective = None + + trainer.train(resume_from_checkpoint=None, trial=vars(config)["_items"]) + # If there hasn't been any evaluation during the training loop. + if getattr(trainer, "objective", None) is None: + metrics = trainer.evaluate() + trainer.objective = trainer.compute_objective(metrics) + format_metrics = rewrite_logs(metrics) + if metric not in format_metrics: + logger.warning( + f"Provided metric {metric} not found. This might result in unexpected sweeps charts. The available metrics are {format_metrics.keys()}" + ) + best_score = False + if best_trial["run_id"] is not None: + if direction == "minimize": + best_score = trainer.objective < best_trial["objective"] + elif direction == "maximize": + best_score = trainer.objective > best_trial["objective"] + + if best_score or best_trial["run_id"] is None: + best_trial["run_id"] = run.id + best_trial["objective"] = trainer.objective + best_trial["hyperparameters"] = dict(config) + + return trainer.objective + + sweep_id = wandb.sweep(sweep_config, project=project, entity=entity) if not sweep_id else sweep_id + logger.info(f"wandb sweep id - {sweep_id}") + wandb.agent(sweep_id, function=_objective, count=n_trials) + + return BestRun(best_trial["run_id"], best_trial["objective"], best_trial["hyperparameters"]) + + def get_available_reporting_integrations(): integrations = [] if is_azureml_available(): @@ -542,6 +615,7 @@ class WandbCallback(TrainerCallback): if hp_search: self._wandb.finish() self._initialized = False + args.run_name = None if not self._initialized: self.setup(args, state, model, **kwargs) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 44bbbd7b99..ca2d1403f3 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -59,7 +59,7 @@ from .file_utils import ( is_torchaudio_available, is_vision_available, ) -from .integrations import is_optuna_available, is_ray_available, is_sigopt_available +from .integrations import is_optuna_available, is_ray_available, is_sigopt_available, is_wandb_available SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" @@ -590,6 +590,19 @@ def require_sigopt(test_case): return test_case +def require_wandb(test_case): + """ + Decorator marking a test that requires wandb. + + These tests are skipped when wandb isn't installed. + + """ + if not is_wandb_available(): + return unittest.skip("test requires wandb")(test_case) + else: + return test_case + + def require_soundfile(test_case): """ Decorator marking a test that requires soundfile diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 6a68f7d6f4..fcc37919f3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -43,9 +43,11 @@ from .integrations import ( # isort: split is_optuna_available, is_ray_tune_available, is_sigopt_available, + is_wandb_available, run_hp_search_optuna, run_hp_search_ray, run_hp_search_sigopt, + run_hp_search_wandb, ) import numpy as np @@ -937,6 +939,8 @@ class Trainer: params.pop("wandb", None) elif self.hp_search_backend == HPSearchBackend.SIGOPT: params = {k: int(v) if isinstance(v, str) else v for k, v in trial.assignments.items()} + elif self.hp_search_backend == HPSearchBackend.WANDB: + params = trial for key, value in params.items(): if not hasattr(self.args, key): @@ -953,6 +957,8 @@ class Trainer: logger.info("Trial:", trial.params) if self.hp_search_backend == HPSearchBackend.SIGOPT: logger.info(f"SigOpt Assignments: {trial.assignments}") + if self.hp_search_backend == HPSearchBackend.WANDB: + logger.info(f"W&B Sweep parameters: {trial}") if self.args.deepspeed: # Rebuild the deepspeed config to reflect the updated training parameters from transformers.deepspeed import HfDeepSpeedConfig @@ -1646,6 +1652,10 @@ class Trainer: run_id = tune.get_trial_id() elif self.hp_search_backend == HPSearchBackend.SIGOPT: run_id = trial.id + elif self.hp_search_backend == HPSearchBackend.WANDB: + import wandb + + run_id = wandb.run.id run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}" run_dir = os.path.join(self.args.output_dir, run_name) else: @@ -1848,6 +1858,8 @@ class Trainer: ) if backend == HPSearchBackend.SIGOPT and not is_sigopt_available(): raise RuntimeError("You picked the sigopt backend, but it is not installed. Use `pip install sigopt`.") + if backend == HPSearchBackend.WANDB and not is_wandb_available(): + raise RuntimeError("You picked the wandb backend, but it is not installed. Use `pip install wandb`.") self.hp_search_backend = backend if self.model_init is None: raise RuntimeError( @@ -1862,6 +1874,7 @@ class Trainer: HPSearchBackend.OPTUNA: run_hp_search_optuna, HPSearchBackend.RAY: run_hp_search_ray, HPSearchBackend.SIGOPT: run_hp_search_sigopt, + HPSearchBackend.WANDB: run_hp_search_wandb, } best_run = backend_dict[backend](self, n_trials, direction, **kwargs) diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 135ebf0a48..2f96ff6152 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -210,16 +210,36 @@ def default_hp_space_sigopt(trial): ] +def default_hp_space_wandb(trial) -> Dict[str, float]: + from .integrations import is_wandb_available + + if not is_wandb_available(): + raise ImportError("This function needs wandb installed: `pip install wandb`") + + return { + "method": "random", + "metric": {"name": "objective", "goal": "minimize"}, + "parameters": { + "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, + "num_train_epochs": {"distribution": "int_uniform", "min": 1, "max": 6}, + "seed": {"distribution": "int_uniform", "min": 1, "max": 40}, + "per_device_train_batch_size": {"values": [4, 8, 16, 32, 64]}, + }, + } + + class HPSearchBackend(ExplicitEnum): OPTUNA = "optuna" RAY = "ray" SIGOPT = "sigopt" + WANDB = "wandb" default_hp_space = { HPSearchBackend.OPTUNA: default_hp_space_optuna, HPSearchBackend.RAY: default_hp_space_ray, HPSearchBackend.SIGOPT: default_hp_space_sigopt, + HPSearchBackend.WANDB: default_hp_space_wandb, } diff --git a/tests/test_trainer.py b/tests/test_trainer.py index f7cb287265..cf275f127e 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -60,6 +60,7 @@ from transformers.testing_utils import ( require_torch_non_multi_gpu, require_torch_tf32, require_torch_up_to_2_gpus, + require_wandb, slow, ) from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR @@ -1810,3 +1811,59 @@ class TrainerOptimizerChoiceTest(unittest.TestCase): with patch.dict("sys.modules", {"apex.optimizers": None}): with self.assertRaises(ValueError): Trainer.get_optimizer_cls_and_kwargs(args) + + +@require_torch +@require_wandb +class TrainerHyperParameterWandbIntegrationTest(unittest.TestCase): + def setUp(self): + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + + def test_hyperparameter_search(self): + class MyTrialShortNamer(TrialShortNamer): + DEFAULTS = {"a": 0, "b": 0} + + def hp_space(trial): + + return { + "method": "random", + "metric": {}, + "parameters": { + "a": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, + "b": {"distribution": "int_uniform", "min": 1, "max": 6}, + }, + } + + def model_init(config): + if config is None: + a = 0 + b = 0 + else: + a = config["a"] + b = config["b"] + model_config = RegressionModelConfig(a=a, b=b, double_output=False) + + return RegressionPreTrainedModel(model_config) + + def hp_name(params): + return MyTrialShortNamer.shortname(params) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=tmp_dir, + learning_rate=0.1, + logging_steps=1, + evaluation_strategy=IntervalStrategy.EPOCH, + save_strategy=IntervalStrategy.EPOCH, + num_train_epochs=4, + disable_tqdm=True, + load_best_model_at_end=True, + logging_dir="runs", + run_name="test", + model_init=model_init, + ) + trainer.hyperparameter_search( + direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="wandb", n_trials=4, anonymous="must" + )