[DeepSpeed] decouple DeepSpeedConfigHF from Trainer (#11966)

* decouple DeepSpeedConfigHF from Trainer

* add LoggingLevel ctx manager; add new test

* cleanup

* add docs

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* implemented suggested renames

* formatter workaround

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Stas Bekman
2021-06-01 13:24:52 -07:00
committed by GitHub
parent 1c3ab3e5d6
commit 7ec596ecda
7 changed files with 216 additions and 75 deletions

View File

@@ -20,13 +20,14 @@ import unittest
from copy import deepcopy
from parameterized import parameterized
from transformers import TrainingArguments, is_torch_available
from transformers import AutoModel, TrainingArguments, is_torch_available, logging
from transformers.file_utils import WEIGHTS_NAME
from transformers.integrations import is_deepspeed_available
from transformers.integrations import HfDeepSpeedConfig, is_deepspeed_available
from transformers.testing_utils import (
CaptureLogger,
CaptureStderr,
ExtendSysPath,
LoggingLevel,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
@@ -77,6 +78,56 @@ ZERO3 = "zero3"
stages = [ZERO2, ZERO3]
@require_deepspeed
@require_torch_gpu
class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
"""
Testing non-Trainer DeepSpeed integration
"""
def setUp(self):
super().setUp()
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def test_init_zero3(self):
# test that zero.Init() works correctly under zero3
ds_config = {
"train_batch_size": 1,
"zero_optimization": {
"stage": 3,
},
}
dschf = HfDeepSpeedConfig(ds_config)
self.assertTrue(dschf.is_zero3())
self.assertTrue(is_deepspeed_zero3_enabled())
with LoggingLevel(logging.INFO):
with mockenv_context(**self.dist_env_1_gpu):
logger = logging.get_logger("transformers.modeling_utils")
with CaptureLogger(logger) as cl:
AutoModel.from_pretrained(T5_TINY)
self.assertIn("Detected DeepSpeed ZeRO-3", cl.out)
# now remove zero optimization
del ds_config["zero_optimization"]
dschf = HfDeepSpeedConfig(ds_config)
self.assertFalse(dschf.is_zero3())
self.assertFalse(is_deepspeed_zero3_enabled())
with LoggingLevel(logging.INFO):
with mockenv_context(**self.dist_env_1_gpu):
logger = logging.get_logger("transformers.modeling_utils")
with CaptureLogger(logger) as cl:
AutoModel.from_pretrained(T5_TINY)
self.assertNotIn("Detected DeepSpeed ZeRO-3", cl.out)
@require_deepspeed
@require_torch_gpu
class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
@@ -194,9 +245,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict)
with CaptureLogger(deepspeed_logger) as cs:
with CaptureLogger(deepspeed_logger) as cl:
trainer.train()
self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
# --- These tests need to run on both zero stages --- #
@@ -230,9 +281,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
with mockenv_context(**self.dist_env_1_gpu):
trainer = get_regression_trainer(local_rank=0, deepspeed=self.get_config_dict(stage))
with CaptureLogger(deepspeed_logger) as cs:
with CaptureLogger(deepspeed_logger) as cl:
trainer.train()
self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
@parameterized.expand(stages)
def test_early_get_last_lr(self, stage):