Gaudi3 CI (#38790)
This commit is contained in:
committed by GitHub
parent 2166b6b4ff
commit 984ff89e73
@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_basic_distributed(self, stage, dtype):
|
||||
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
||||
|
||||
@require_torch_fp16
|
||||
@run_first
|
||||
def test_do_eval_no_train(self):
|
||||
# testing only zero3 since zero2 makes no sense with inference
|
||||
self.run_and_check(
|
||||
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
)
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_fp32_non_distributed(self, stage, dtype):
|
||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||
# therefore no quality checks, just basic completion checks are done
|
||||
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_fp32_distributed(self, stage, dtype):
|
||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||
# therefore no quality checks, just basic completion checks are done
|
||||
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
)
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
|
||||
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
||||
# the saved model dir
|
||||
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
|
||||
@parameterized.expand(["bf16", "fp16", "fp32"])
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
def test_inference(self, dtype):
|
||||
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires bfloat16 hardware support")
|
||||
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
return output_dir
|
||||
|
||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||
@run_first
|
||||
def test_clm(self, stage, dtype):
|
||||
# this test exercises model.resize_token_embeddings() which requires param gathering outside
|
||||
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
|
||||
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
@require_torch_fp16
|
||||
@run_first
|
||||
def test_clm_from_config_zero3_fp16(self):
|
||||
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
|
||||
get_tests_dir,
|
||||
require_deepspeed,
|
||||
require_torch_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
|
||||
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_deepspeed
|
||||
@require_torch_accelerator
|
||||
class TestDeepSpeedModelZoo(TestCasePlus):
|
||||
|
||||
Reference in New Issue
Block a user