From ce2fef2ad278cd72748dfe5b049b4d58569e3d9a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Apr 2022 17:24:38 -0700 Subject: [PATCH] [trainer / deepspeed] fix hyperparameter_search (#16740) * [trainer / deepspeed] fix hyperparameter_search * require optuna * style * oops * add dep in the right place * create deepspeed-testing dep group * Trigger CI --- .github/workflows/self-nightly-scheduled.yml | 2 +- .github/workflows/self-push.yml | 2 +- .../Dockerfile | 2 +- setup.py | 2 ++ src/transformers/trainer.py | 5 ++-- tests/deepspeed/test_deepspeed.py | 28 +++++++++++++++++++ 6 files changed, 36 insertions(+), 5 deletions(-) diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index d1f3ce9a77..5acb21debf 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -157,7 +157,7 @@ jobs: apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng pip install --upgrade pip pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U - pip install .[testing,deepspeed] + pip install .[deepspeed-testing] pip install https://github.com/kpu/kenlm/archive/master.zip pip install git+https://github.com/microsoft/DeepSpeed diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 66550d25e3..1889e4d928 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -384,7 +384,7 @@ jobs: run: | apt -y update && apt install -y libaio-dev pip install --upgrade pip - pip install .[testing,deepspeed] + pip install .[deepspeed-testing] - name: Are GPUs recognized by our DL frameworks run: | diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 9ef0ac4623..1dd080c319 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN python3 -m pip install --no-cache-dir -e ./transformers[testing,deepspeed] +RUN python3 -m pip install --no-cache-dir -e ./transformers[deepspeed-testing] RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 diff --git a/setup.py b/setup.py index 4d386ae008..df483c1ec9 100644 --- a/setup.py +++ b/setup.py @@ -290,6 +290,8 @@ extras["testing"] = ( + extras["modelcreation"] ) +extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["quality"] = deps_list("black", "isort", "flake8", "GitPython", "hf-doc-builder") extras["all"] = ( diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9bd6f0f62b..aff8b0114d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -976,9 +976,10 @@ class Trainer: logger.info(f"W&B Sweep parameters: {trial}") if self.args.deepspeed: # Rebuild the deepspeed config to reflect the updated training parameters - from transformers.deepspeed import HfDeepSpeedConfig + from transformers.deepspeed import HfTrainerDeepSpeedConfig - self.args.hf_deepspeed_config = HfDeepSpeedConfig(self.args.deepspeed) + self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed) + self.args.hf_deepspeed_config.trainer_config_process(self.args) def _report_to_hp_search( self, trial: Union["optuna.Trial", Dict[str, Any]], epoch: int, metrics: Dict[str, float] diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index acb72886bb..9fba62815b 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -34,6 +34,7 @@ from transformers.testing_utils import ( get_gpu_count, mockenv_context, require_deepspeed, + require_optuna, require_torch_gpu, require_torch_multi_gpu, slow, @@ -363,6 +364,33 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): trainer.train() self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") + @require_optuna + def test_hyperparameter_search(self): + with mockenv_context(**self.dist_env_1_gpu): + + ds_config_zero3_dict = self.get_config_dict(ZERO3) + + # hyperparameter_search requires model_init() to recreate the model for each trial + def model_init(): + config = RegressionModelConfig(a=0, b=0, double_output=False) + model = RegressionPreTrainedModel(config) + return model + + trainer = get_regression_trainer( + local_rank=0, + fp16=True, + model_init=model_init, + deepspeed=ds_config_zero3_dict, + ) + + n_trials = 3 + with CaptureLogger(deepspeed_logger) as cl: + with CaptureStd() as cs: + trainer.hyperparameter_search(direction="maximize", n_trials=n_trials) + self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") + self.assertIn(f"Trial {n_trials-1} finished with value", cs.err, "expected hyperparameter_search output") + self.assertIn("Best is trial", cs.err, "expected hyperparameter_search output") + # --- These tests need to run on both zero stages --- # @parameterized.expand(params, name_func=parameterized_custom_name_func)