HPU support (#36424)
* test * fix * fix * skip some and run some first * test fsdp * fix * patches for generate * test distributed * copy * don't test distributed loss for hpu * require fp16 and run first * changes from marc's PR fixing zero3 * better alternative * return True when fp16 support on gaudi without creating bridge * fix * fix tested dtype in deepspeed inference test * test * fix * test * fix * skip * require fp16 * run first fsdp * Apply suggestions from code review * address comments * address comments and refactor test * reduce precision * avoid doing gaudi1 specific stuff in the generation loop * document test_gradient_accumulation_loss_alignment_with_model_loss test a bit more
This commit is contained in:
committed by
GitHub
parent
50d3530aa0
commit
89f6956015
@@ -45,12 +45,14 @@ from transformers.testing_utils import (
|
||||
require_deepspeed,
|
||||
require_optuna,
|
||||
require_torch_accelerator,
|
||||
require_torch_fp16,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint, set_seed
|
||||
from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device
|
||||
from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -150,10 +152,12 @@ optims = [HF_OPTIM, DS_OPTIM]
|
||||
schedulers = [HF_SCHEDULER, DS_SCHEDULER]
|
||||
|
||||
stages = [ZERO2, ZERO3]
|
||||
|
||||
dtypes = []
|
||||
if is_torch_bf16_available_on_device(torch_device):
|
||||
dtypes = [FP16, BF16]
|
||||
else:
|
||||
dtypes = [FP16]
|
||||
dtypes.append(BF16)
|
||||
if is_torch_fp16_available_on_device(torch_device):
|
||||
dtypes.append(FP16)
|
||||
|
||||
|
||||
def parameterized_custom_name_func(func, param_num, param):
|
||||
@@ -228,6 +232,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
AutoModel.from_pretrained(T5_TINY)
|
||||
self.assertNotIn("Detected DeepSpeed ZeRO-3", cl.out)
|
||||
|
||||
@require_torch_fp16
|
||||
@require_torch_accelerator
|
||||
def test_init_zero3_fp16(self):
|
||||
# test that zero.Init() works correctly under zero3/fp16
|
||||
@@ -456,6 +461,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
|
||||
|
||||
|
||||
@require_deepspeed
|
||||
@require_torch_fp16
|
||||
@require_torch_accelerator
|
||||
class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
|
||||
"""
|
||||
@@ -714,7 +720,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
# dynamic loss scale value set to:
|
||||
# "fp16.initial_scale_power": 1
|
||||
# plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file
|
||||
# but for some reason going to train_len=64 the weights, weights start to mismatch with this setup.
|
||||
# but for some reason going to train_len=64, the weights start to mismatch with this setup.
|
||||
# the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical
|
||||
|
||||
train_len = 64
|
||||
@@ -757,8 +763,12 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
|
||||
# training with half the batch size but accumulation steps as 2 should give the same
|
||||
# weights, but sometimes get a slight difference still of 1e-6
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
|
||||
if torch_device == "hpu":
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, delta=1e-4)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, delta=1e-4)
|
||||
else:
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
|
||||
|
||||
# Relative difference. See the note above how to get identical loss on a small bs
|
||||
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
|
||||
@@ -1100,6 +1110,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_deepspeed
|
||||
@require_torch_accelerator
|
||||
class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
@@ -1126,6 +1137,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
def test_basic_distributed(self, stage, dtype):
|
||||
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
||||
|
||||
@require_torch_fp16
|
||||
def test_do_eval_no_train(self):
|
||||
# testing only zero3 since zero2 makes no sense with inference
|
||||
self.run_and_check(
|
||||
@@ -1199,12 +1211,15 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires bfloat16 hardware support")
|
||||
|
||||
if dtype == "fp16" and not is_torch_fp16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires fp16 hardware support")
|
||||
|
||||
# this is just inference, so no optimizer should be loaded
|
||||
# it only works for z3 (makes no sense with z1-z2)
|
||||
fp32 = True if dtype == "fp32" else False
|
||||
self.run_and_check(
|
||||
stage=ZERO3,
|
||||
dtype=FP16,
|
||||
dtype=dtype,
|
||||
model_name=T5_TINY,
|
||||
distributed=True,
|
||||
do_train=False,
|
||||
@@ -1381,6 +1396,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
@require_torch_fp16
|
||||
def test_clm_from_config_zero3_fp16(self):
|
||||
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
||||
|
||||
|
||||
Reference in New Issue
Block a user