[DeepSpeed Wav2vec2] integration (#11638)

* wip

* wip - but working with https://github.com/microsoft/DeepSpeed/pull/1044

* cleanup

* workaround

* working 5/8 modes

* solve fp32 distributed zero3

* style

* sync

* sync

* rework

* deprecation

* cleanup

* https://github.com/microsoft/DeepSpeed/pull/1044 PR was merged

* clean up

* add a guide

* more prose

* more prose

* fix

* more prose

* sub_group_size was too big

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* refactor

* bug fix

* make the true check explicit

* new deepspeed release

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Stas Bekman
2021-06-08 12:32:03 -07:00
committed by GitHub
parent 32290d87f6
commit 11d86d3de4
11 changed files with 496 additions and 64 deletions

View File

@@ -32,6 +32,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_gpu_count,
mockenv_context,
require_deepspeed,
require_torch_gpu,
require_torch_multi_gpu,
slow,
@@ -58,17 +59,6 @@ def load_json(path):
return json.load(f)
# a candidate for testing_utils
def require_deepspeed(test_case):
    """
    Decorator marking a test that requires deepspeed.

    Returns ``test_case`` unchanged when ``is_deepspeed_available()`` is true;
    otherwise returns it wrapped in ``unittest.skip`` so the test is skipped.
    """
    # Guard clause: nothing to wrap when deepspeed can be used.
    if is_deepspeed_available():
        return test_case

    skip_decorator = unittest.skip("test requires deepspeed")
    return skip_decorator(test_case)
def require_deepspeed_aio(test_case):
"""
Decorator marking a test that requires deepspeed aio (nvme)
@@ -404,15 +394,19 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
train_len = 64
a = b = 0.0
kwargs = dict(
a=a,
b=b,
local_rank=0,
train_len=train_len,
fp16=True,
deepspeed=self.get_config_dict(stage),
)
with mockenv_context(**self.dist_env_1_gpu):
no_grad_accum_trainer = get_regression_trainer(
a=a,
b=b,
local_rank=0,
train_len=train_len,
fp16=True,
deepspeed=self.get_config_dict(stage),
per_device_train_batch_size=8,
**kwargs,
per_device_train_batch_size=16,
gradient_accumulation_steps=1,
)
no_grad_accum_result = no_grad_accum_trainer.train()
@@ -424,14 +418,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
with mockenv_context(**self.dist_env_1_gpu):
yes_grad_accum_trainer = get_regression_trainer(
a=a,
b=b,
local_rank=0,
train_len=train_len,
fp16=True,
deepspeed=self.get_config_dict(stage),
**kwargs,
per_device_train_batch_size=4,
gradient_accumulation_steps=2,
gradient_accumulation_steps=4,
)
yes_grad_accum_result = yes_grad_accum_trainer.train()
yes_grad_accum_loss = yes_grad_accum_result.training_loss
@@ -445,7 +434,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
# see the note above how to get identical loss on a small bs
self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage):
# adapted from TrainerIntegrationCommon.check_saved_checkpoints