[trainer/deepspeed] load_best_model (reimplement re-init) (#17151)

* [trainer/deepspeed] load_best_model * to sync with DS PR #1947 * simplify * rework load_best_model test * cleanup * bump deepspeed>=0.6.5 Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
2022-06-02 09:14:21 -07:00
parent 046c5ea906
commit 2f59ad1609
5 changed files with 141 additions and 93 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -20,6 +20,8 @@ import os
 import unittest
 from copy import deepcopy

+import datasets
+
 from parameterized import parameterized
 from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
 from transformers import AutoModel, TrainingArguments, is_torch_available, logging
@@ -195,28 +197,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        self.assertNotIn("Detected DeepSpeed ZeRO-3", cl.out)


-@require_deepspeed
-@require_torch_gpu
-class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
-    """
-
-    This class is for testing directly via get_regression_trainer
-
-    It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods
-    which we can re-use here.
-
-    Important: this class' setup can only work with a single gpu because it runs within the current
-    pytest worker. For multi-gpu tests use TestDeepSpeedWithLauncher.
-
-    Note: if any of the tests of this class get run there will be at least one gpu occupied by them
-    until this pytest worker exits. This is because the gpu memory allocated by the cuda-kernels
-    won't be released until this pytest worker exits.
-
-    This may appear as some run-away tests if you watch `nvidia-smi` while other tests that fork new
-    processes are run. So there will be one or two "stale" processes reported in `nvidia-smi`. This
-    is not a bug.
-    """
-
+class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
    def setUp(self):
        super().setUp()

@@ -252,6 +233,29 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # As some tests modify the dict, always make a copy
        return deepcopy(self.ds_config_dict[stage])

+
+@require_deepspeed
+@require_torch_gpu
+class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
+    """
+
+    This class is for testing directly via get_regression_trainer
+
+    It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods
+    which we can re-use here.
+
+    Important: this class' setup can only work with a single gpu because it runs within the current
+    pytest worker. For multi-gpu tests use TestDeepSpeedWithLauncher.
+
+    Note: if any of the tests of this class get run there will be at least one gpu occupied by them
+    until this pytest worker exits. This is because the gpu memory allocated by the cuda-kernels
+    won't be released until this pytest worker exits.
+
+    This may appear as some run-away tests if you watch `nvidia-smi` while other tests that fork new
+    processes are run. So there will be one or two "stale" processes reported in `nvidia-smi`. This
+    is not a bug.
+    """
+
    # --- These tests are enough to run on one of zero stages --- #

    def test_hf_ds_config_mismatch(self):
@@ -725,6 +729,95 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            self.assertFalse(is_deepspeed_zero3_enabled())
            self.assertFalse(bool(config), "Deepspeed config should not be accessible")

+    @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    def test_load_best_model(self, stage, dtype):
+        # Test that forced deepspeed reinit doesn't break the model. the forced re-init after
+        # loading the best model in Trainer is there to workaround this bug in Deepspeed
+        # https://github.com/microsoft/DeepSpeed/issues/1612
+        #
+        # The test is derived from a repro script submitted in this Issue:
+        # https://github.com/huggingface/transformers/issues/17114
+        #
+        # One additional feature of this test is that we use a non-AdamW optimizer to test that
+        # deepspeed doesn't fallback to AdamW, which would prevent the optimizer states from loading
+        # correctly
+
+        from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer  # noqa
+
+        output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)
+
+        ds_config_dict = self.get_config_dict(stage)
+        del ds_config_dict["optimizer"]  # will use HF Trainer optimizer
+        del ds_config_dict["scheduler"]  # will use HF Trainer scheduler
+        # must use this setting to get the reload path exercised
+        ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
+
+        tokenizer = T5Tokenizer.from_pretrained(T5_TINY)
+        model = T5ForConditionalGeneration.from_pretrained(T5_TINY)
+
+        def _add_eos_to_examples(example):
+            example["input_text"] = f"question: {example['question']}  context: {example['context']}"
+            example["target_text"] = example["answers"]["text"][0] if len(example["answers"]["text"]) > 0 else ""
+            return example
+
+        def _convert_to_features(example_batch):
+            input_encodings = tokenizer.batch_encode_plus(
+                example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True
+            )
+            target_encodings = tokenizer.batch_encode_plus(
+                example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True
+            )
+
+            encodings = {
+                "input_ids": input_encodings["input_ids"],
+                "attention_mask": input_encodings["attention_mask"],
+                "labels": target_encodings["input_ids"],
+            }
+
+            return encodings
+
+        def get_dataset():
+            data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
+            data_files = dict(train=data_file, validation=data_file)
+            raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
+            train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
+            valid_dataset = deepcopy(train_dataset)
+            return train_dataset, valid_dataset
+
+        train_dataset, eval_dataset = get_dataset()
+
+        args_dict = {
+            "per_gpu_train_batch_size": 1,
+            "per_gpu_eval_batch_size": 1,
+            "gradient_accumulation_steps": 1,
+            "learning_rate": 1e-4,
+            "num_train_epochs": 1,
+            "do_train": True,
+            "do_eval": True,
+            "optim": "adafactor",
+            "evaluation_strategy": "steps",
+            "eval_steps": 1,
+            "save_strategy": "steps",
+            "save_steps": 1,
+            "load_best_model_at_end": True,
+            "max_steps": 1,
+            "deepspeed": ds_config_dict,
+        }
+
+        with mockenv_context(**self.dist_env_1_gpu):
+
+            training_args = TrainingArguments(output_dir, **args_dict)
+
+            trainer = Trainer(
+                model=model,
+                tokenizer=tokenizer,
+                args=training_args,
+                train_dataset=train_dataset,
+                eval_dataset=eval_dataset,
+            )
+            trainer.train()  # crash 1 was here
+            trainer.evaluate()  # crash 2 was here
+

@slow
@require_deepspeed
@@ -1035,50 +1128,3 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        with CaptureStderr() as cs:
            execute_subprocess_async(cmd, env=self.get_env())
        self.assertIn("Detected DeepSpeed ZeRO-3", cs.err)
-
-    @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    def test_load_best_model(self, stage, dtype):
-        # this test exercises --load_best_model_at_end - the key is being able to resume after some training
-
-        data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
-        output_dir = self.get_auto_remove_tmp_dir()
-        args = f"""
-            --model_name_or_path {T5_TINY}
-            --tokenizer_name {T5_TINY}
-            --train_file {data_dir}/train.json
-            --validation_file {data_dir}/val.json
-            --output_dir {output_dir}
-            --overwrite_output_dir
-            --source_lang en
-            --target_lang ro
-            --do_train
-            --max_train_samples 3
-            --do_eval
-            --max_eval_samples 1
-            --logging_strategy steps
-            --logging_steps 1
-            --evaluation_strategy steps
-            --eval_steps 1
-            --save_strategy steps
-            --save_steps 1
-            --load_best_model_at_end
-            --per_device_train_batch_size 1
-            --per_device_eval_batch_size 1
-            --num_train_epochs 1
-            --report_to none
-            """.split()
-        args.extend(["--source_prefix", "translate English to Romanian: "])
-
-        args.extend([f"--{dtype}"])
-
-        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
-        script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
-        launcher = get_launcher(distributed=False)
-
-        cmd = launcher + script + args + ds_args
-        # keep for quick debug
-        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
-        with CaptureStd() as cs:
-            execute_subprocess_async(cmd, env=self.get_env())
-        # enough to test it didn't fail
-        self.assertIn("DeepSpeed info", cs.out)