HPU support (#36424)
* test * fix * fix * skip some and run some first * test fsdp * fix * patches for generate * test distributed * copy * don't test distributed loss for hpu * require fp16 and run first * changes from marc's PR fixing zero3 * better alternative * return True when fp16 support on gaudi without creating bridge * fix * fix tested dtype in deepspeed inference test * test * fix * test * fix * skip * require fp16 * run first fsdp * Apply suggestions from code review * address comments * address comments and refactor test * reduce precision * avoid doing gaudi1 specific stuff in the generation loop * document test_gradient_accumulation_loss_alignment_with_model_loss test a bit more
This commit is contained in:
committed by
GitHub
parent
50d3530aa0
commit
89f6956015
@@ -45,12 +45,14 @@ from transformers.testing_utils import (
|
||||
require_deepspeed,
|
||||
require_optuna,
|
||||
require_torch_accelerator,
|
||||
require_torch_fp16,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint, set_seed
|
||||
from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device
|
||||
from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -150,10 +152,12 @@ optims = [HF_OPTIM, DS_OPTIM]
|
||||
schedulers = [HF_SCHEDULER, DS_SCHEDULER]
|
||||
|
||||
stages = [ZERO2, ZERO3]
|
||||
|
||||
dtypes = []
|
||||
if is_torch_bf16_available_on_device(torch_device):
|
||||
dtypes = [FP16, BF16]
|
||||
else:
|
||||
dtypes = [FP16]
|
||||
dtypes.append(BF16)
|
||||
if is_torch_fp16_available_on_device(torch_device):
|
||||
dtypes.append(FP16)
|
||||
|
||||
|
||||
def parameterized_custom_name_func(func, param_num, param):
|
||||
@@ -228,6 +232,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
AutoModel.from_pretrained(T5_TINY)
|
||||
self.assertNotIn("Detected DeepSpeed ZeRO-3", cl.out)
|
||||
|
||||
@require_torch_fp16
|
||||
@require_torch_accelerator
|
||||
def test_init_zero3_fp16(self):
|
||||
# test that zero.Init() works correctly under zero3/fp16
|
||||
@@ -456,6 +461,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
|
||||
|
||||
|
||||
@require_deepspeed
|
||||
@require_torch_fp16
|
||||
@require_torch_accelerator
|
||||
class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
|
||||
"""
|
||||
@@ -714,7 +720,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
# dynamic loss scale value set to:
|
||||
# "fp16.initial_scale_power": 1
|
||||
# plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file
|
||||
# but for some reason going to train_len=64 the weights, weights start to mismatch with this setup.
|
||||
# but for some reason going to train_len=64, the weights start to mismatch with this setup.
|
||||
# the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical
|
||||
|
||||
train_len = 64
|
||||
@@ -757,8 +763,12 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
|
||||
# training with half the batch size but accumulation steps as 2 should give the same
|
||||
# weights, but sometimes get a slight difference still of 1e-6
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
|
||||
if torch_device == "hpu":
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, delta=1e-4)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, delta=1e-4)
|
||||
else:
|
||||
self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
|
||||
self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
|
||||
|
||||
# Relative difference. See the note above how to get identical loss on a small bs
|
||||
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
|
||||
@@ -1100,6 +1110,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
@require_deepspeed
|
||||
@require_torch_accelerator
|
||||
class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
@@ -1126,6 +1137,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
def test_basic_distributed(self, stage, dtype):
|
||||
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
||||
|
||||
@require_torch_fp16
|
||||
def test_do_eval_no_train(self):
|
||||
# testing only zero3 since zero2 makes no sense with inference
|
||||
self.run_and_check(
|
||||
@@ -1199,12 +1211,15 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires bfloat16 hardware support")
|
||||
|
||||
if dtype == "fp16" and not is_torch_fp16_available_on_device(torch_device):
|
||||
self.skipTest(reason="test requires fp16 hardware support")
|
||||
|
||||
# this is just inference, so no optimizer should be loaded
|
||||
# it only works for z3 (makes no sense with z1-z2)
|
||||
fp32 = True if dtype == "fp32" else False
|
||||
self.run_and_check(
|
||||
stage=ZERO3,
|
||||
dtype=FP16,
|
||||
dtype=dtype,
|
||||
model_name=T5_TINY,
|
||||
distributed=True,
|
||||
do_train=False,
|
||||
@@ -1381,6 +1396,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
@require_torch_fp16
|
||||
def test_clm_from_config_zero3_fp16(self):
|
||||
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
||||
|
||||
|
||||
@@ -33,12 +33,17 @@ from transformers.testing_utils import (
|
||||
require_fsdp,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.trainer_callback import TrainerState
|
||||
from transformers.trainer_utils import FSDPOption, set_seed
|
||||
from transformers.utils import is_accelerate_available, is_torch_bf16_available_on_device
|
||||
from transformers.utils import (
|
||||
is_accelerate_available,
|
||||
is_torch_bf16_available_on_device,
|
||||
is_torch_fp16_available_on_device,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -49,14 +54,19 @@ else:
|
||||
|
||||
# default torch.distributed port
|
||||
DEFAULT_MASTER_PORT = "10999"
|
||||
dtypes = ["fp16"]
|
||||
|
||||
dtypes = []
|
||||
if is_torch_bf16_available_on_device(torch_device):
|
||||
dtypes += ["bf16"]
|
||||
if is_torch_fp16_available_on_device(torch_device):
|
||||
dtypes += ["fp16"]
|
||||
|
||||
sharding_strategies = ["full_shard", "shard_grad_op"]
|
||||
state_dict_types = ["FULL_STATE_DICT", "SHARDED_STATE_DICT"]
|
||||
set_seed(42)
|
||||
params = list(itertools.product(sharding_strategies, dtypes))
|
||||
|
||||
set_seed(42)
|
||||
|
||||
|
||||
def get_master_port(real_launcher=False):
|
||||
"""
|
||||
@@ -140,13 +150,13 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
}
|
||||
|
||||
self.fsdp_config = {
|
||||
"backward_prefetch": "backward_pre",
|
||||
"forward_prefetch": "False",
|
||||
"limit_all_gathers": "False",
|
||||
"use_orig_params": "True",
|
||||
"sync_module_states": "True",
|
||||
"cpu_ram_efficient_loading": "True",
|
||||
"activation_checkpointing": "False",
|
||||
"backward_prefetch": "BACKWARD_PRE",
|
||||
"forward_prefetch": "false",
|
||||
"limit_all_gathers": "false",
|
||||
"use_orig_params": "true",
|
||||
"sync_module_states": "true",
|
||||
"cpu_ram_efficient_loading": "true",
|
||||
"activation_checkpointing": "false",
|
||||
"min_num_params": 1,
|
||||
}
|
||||
|
||||
@@ -202,7 +212,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertEqual(
|
||||
os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"], ",".join(fsdp_config["transformer_layer_cls_to_wrap"])
|
||||
)
|
||||
self.assertEqual(os.environ[f"{prefix}BACKWARD_PREFETCH"], fsdp_config["backward_prefetch"].upper())
|
||||
self.assertEqual(os.environ[f"{prefix}BACKWARD_PREFETCH"], fsdp_config["backward_prefetch"])
|
||||
self.assertEqual(os.environ[f"{prefix}FORWARD_PREFETCH"], fsdp_config["forward_prefetch"])
|
||||
self.assertEqual(os.environ[f"{prefix}USE_ORIG_PARAMS"], fsdp_config["use_orig_params"])
|
||||
self.assertEqual(os.environ[f"{prefix}SYNC_MODULE_STATES"], fsdp_config["sync_module_states"])
|
||||
@@ -213,6 +223,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
|
||||
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
def test_basic_run(self, sharding_strategy, dtype):
|
||||
launcher = get_launcher(distributed=True, use_accelerate=False)
|
||||
@@ -225,6 +236,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
|
||||
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
def test_basic_run_with_gradient_accumulation(self, sharding_strategy, dtype):
|
||||
launcher = get_launcher(distributed=True, use_accelerate=False)
|
||||
@@ -237,6 +249,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
|
||||
@parameterized.expand(dtypes)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
@unittest.skipIf(not is_torch_greater_or_equal_than_2_1, reason="This test on pytorch 2.0 takes 4 hours.")
|
||||
def test_basic_run_with_cpu_offload(self, dtype):
|
||||
@@ -250,6 +263,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
|
||||
@parameterized.expand(state_dict_types, name_func=_parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
def test_training_and_can_resume_normally(self, state_dict_type):
|
||||
output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False)
|
||||
@@ -286,10 +300,13 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
|
||||
|
||||
@require_torch_multi_accelerator
|
||||
@run_first
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
@require_fsdp
|
||||
def test_fsdp_cpu_offloading(self):
|
||||
# TODO: This file is missing and should be added or the test should be removed
|
||||
if not os.path.exists("utils/testing_scripts/fsdp_cpu_offloading.py"):
|
||||
raise unittest.SkipTest("FSDP CPU offloading script not found!")
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
"accelerate launch utils/testing_scripts/fsdp_cpu_offloading.py --config utils/testing_scripts/dummy_fsdp_config.yml",
|
||||
|
||||
@@ -2770,7 +2770,7 @@ class ModelTesterMixin:
|
||||
elif param_device in ["mps"]:
|
||||
self.assertEqual(param.device, torch.device("mps"))
|
||||
else:
|
||||
# when loaded with device_map, `param_device` are integer values for cuda/xpu/npu/mlu
|
||||
# when loaded with device_map, `param_device` are integer values for cuda/xpu/hpu/npu/mlu
|
||||
self.assertEqual(param.device, torch.device(f"{torch_device}:{param_device}"))
|
||||
|
||||
@require_accelerate
|
||||
|
||||
@@ -75,6 +75,7 @@ from transformers.testing_utils import (
|
||||
require_intel_extension_for_pytorch,
|
||||
require_liger_kernel,
|
||||
require_lomo,
|
||||
require_non_hpu,
|
||||
require_non_xpu,
|
||||
require_optuna,
|
||||
require_peft,
|
||||
@@ -88,6 +89,7 @@ from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
require_torch_bf16,
|
||||
require_torch_fp16,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_accelerator,
|
||||
require_torch_non_multi_accelerator,
|
||||
@@ -98,6 +100,7 @@ from transformers.testing_utils import (
|
||||
require_torchdynamo,
|
||||
require_vision,
|
||||
require_wandb,
|
||||
run_first,
|
||||
run_test_using_subprocess,
|
||||
slow,
|
||||
torch_device,
|
||||
@@ -119,6 +122,13 @@ from transformers.utils import (
|
||||
from transformers.utils.hp_naming import TrialShortNamer
|
||||
|
||||
|
||||
if torch_device == "hpu":
|
||||
RTOL = 1e-3
|
||||
ATOL = 1e-3
|
||||
else:
|
||||
RTOL = 1e-5
|
||||
ATOL = 1e-5
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
@@ -726,11 +736,11 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
trainer.train()
|
||||
self.alternate_trained_model = (trainer.model.a, trainer.model.b)
|
||||
|
||||
def check_trained_model(self, model, alternate_seed=False):
|
||||
def check_trained_model(self, model, alternate_seed=False, **kwargs):
|
||||
# Checks a training seeded with learning_rate = 0.1
|
||||
(a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
|
||||
torch.testing.assert_close(model.a, a)
|
||||
torch.testing.assert_close(model.b, b)
|
||||
torch.testing.assert_close(model.a, a, **kwargs)
|
||||
torch.testing.assert_close(model.b, b, **kwargs)
|
||||
|
||||
def test_reproducible_training(self):
|
||||
# Checks that training worked, model trained and seed made a reproducible training.
|
||||
@@ -812,11 +822,6 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
state_dict = model.state_dict()
|
||||
|
||||
base_loss_callback = StoreLossCallback()
|
||||
|
||||
args_kwargs = {
|
||||
"report_to": "none",
|
||||
"logging_steps": 1,
|
||||
@@ -830,6 +835,10 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
tmp_dir,
|
||||
**args_kwargs,
|
||||
)
|
||||
# train with base loss
|
||||
set_seed(42)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
base_loss_callback = StoreLossCallback()
|
||||
trainer = Trainer(
|
||||
model,
|
||||
args,
|
||||
@@ -840,16 +849,17 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
assert trainer.model_accepts_loss_kwargs
|
||||
trainer.train()
|
||||
|
||||
grad_accum_loss_callback = StoreLossCallback()
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
args = TrainingArguments(
|
||||
tmp_dir,
|
||||
**args_kwargs,
|
||||
gradient_accumulation_steps=2,
|
||||
per_device_train_batch_size=4,
|
||||
)
|
||||
|
||||
# train with gradient accumulation
|
||||
set_seed(42)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
grad_accum_loss_callback = StoreLossCallback()
|
||||
trainer = Trainer(
|
||||
model,
|
||||
args,
|
||||
@@ -857,10 +867,12 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
callbacks=[grad_accum_loss_callback],
|
||||
data_collator=data_collator,
|
||||
)
|
||||
assert trainer.model_accepts_loss_kwargs
|
||||
trainer.train()
|
||||
|
||||
# train with broken loss
|
||||
set_seed(42)
|
||||
model.load_state_dict(state_dict)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
broken_loss_callback = StoreLossCallback()
|
||||
trainer = Trainer(
|
||||
model,
|
||||
@@ -869,30 +881,28 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
callbacks=[broken_loss_callback],
|
||||
data_collator=data_collator,
|
||||
)
|
||||
# disable model_accepts_loss_kwargs
|
||||
# disable model_accepts_loss_kwargs so that "num_items_in_batch" is not passed to the model
|
||||
trainer.model_accepts_loss_kwargs = False
|
||||
trainer.train()
|
||||
|
||||
# Calculate the difference between the base loss and the grad_accum loss
|
||||
diff_truth = [
|
||||
abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
|
||||
]
|
||||
diff_broken = [
|
||||
abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)
|
||||
]
|
||||
# Calculate the difference between the base loss and the grad_accum loss
|
||||
diff_truth = [
|
||||
abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
|
||||
]
|
||||
diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)]
|
||||
|
||||
# all diff truth should be quite close
|
||||
self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
|
||||
# all diff truth should be quite close
|
||||
self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
|
||||
|
||||
# max diff broken should be very off
|
||||
self.assertGreater(max(diff_broken), 1.5, f"Difference {max(diff_broken)} is not greater than 2")
|
||||
# max diff broken should be very off
|
||||
self.assertGreater(max(diff_broken), 1.3, f"Difference {max(diff_broken)} is not greater than 1.3")
|
||||
|
||||
loss_base = sum(base_loss_callback.losses)
|
||||
loss_broken = sum(broken_loss_callback.losses)
|
||||
loss_base = sum(base_loss_callback.losses)
|
||||
loss_broken = sum(broken_loss_callback.losses)
|
||||
|
||||
# mean/sum loss should not vary too much.
|
||||
relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
|
||||
self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
|
||||
# mean/sum loss should not vary too much.
|
||||
relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
|
||||
self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
|
||||
|
||||
def test_gradient_accumulation_loss_alignment_with_loss_func(self):
|
||||
set_seed(42)
|
||||
@@ -1214,14 +1224,14 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertFalse(torch.allclose(trainer.model.b, b))
|
||||
self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)
|
||||
|
||||
@require_torch_accelerator
|
||||
@require_torch_bf16
|
||||
@require_torch_accelerator
|
||||
def test_mixed_bf16(self):
|
||||
# very basic test
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
|
||||
trainer.train()
|
||||
self.check_trained_model(trainer.model)
|
||||
self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)
|
||||
|
||||
# --bf16 --half_precision_backend apex can't be used together
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
@@ -3582,6 +3592,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_mrpc(self):
|
||||
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@@ -3598,6 +3609,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertLess(result["eval_loss"], 0.2)
|
||||
|
||||
@slow
|
||||
@run_first
|
||||
def test_trainer_eval_multiple(self):
|
||||
MODEL_ID = "openai-community/gpt2"
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
@@ -3897,6 +3909,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir)
|
||||
self.check_mem_metrics(trainer, self.assertNotIn)
|
||||
|
||||
@require_torch_fp16
|
||||
@require_torch_accelerator
|
||||
def test_fp16_full_eval(self):
|
||||
# this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
|
||||
@@ -4152,6 +4165,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
|
||||
|
||||
@slow
|
||||
@require_non_hpu
|
||||
@require_torch_multi_accelerator
|
||||
def test_end_to_end_example(self):
|
||||
# Tests that `translation.py` will run without issues
|
||||
|
||||
@@ -19,12 +19,11 @@ import numpy as np
|
||||
from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available
|
||||
from transformers.testing_utils import (
|
||||
TestCasePlus,
|
||||
backend_device_count,
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_multi_xpu,
|
||||
require_torch_neuroncore,
|
||||
require_torch_npu,
|
||||
require_torch_multi_accelerator,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.training_args import ParallelMode
|
||||
from transformers.utils import logging
|
||||
@@ -117,38 +116,10 @@ if is_torch_available():
|
||||
return result
|
||||
|
||||
|
||||
class TestTrainerDistributedNeuronCore(TestCasePlus):
|
||||
@require_torch_neuroncore
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node=2
|
||||
--master_port={get_torch_dist_unique_port()}
|
||||
{self.test_file_dir}/test_trainer_distributed.py
|
||||
""".split()
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"--output_dir {output_dir}".split()
|
||||
cmd = ["torchrun"] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
class TestTrainerDistributedNPU(TestCasePlus):
|
||||
@require_torch_npu
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node=2
|
||||
--master_port={get_torch_dist_unique_port()}
|
||||
{self.test_file_dir}/test_trainer_distributed.py
|
||||
""".split()
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"--output_dir {output_dir}".split()
|
||||
cmd = ["torchrun"] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
class TestTrainerDistributed(TestCasePlus):
|
||||
@require_torch_multi_gpu
|
||||
@require_torch_multi_accelerator
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node={torch.cuda.device_count()}
|
||||
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
|
||||
--master_port={get_torch_dist_unique_port()}
|
||||
{self.test_file_dir}/test_trainer_distributed.py
|
||||
""".split()
|
||||
@@ -159,20 +130,6 @@ class TestTrainerDistributed(TestCasePlus):
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
@require_torch_multi_xpu
|
||||
class TestTrainerDistributedXPU(TestCasePlus):
|
||||
def test_trainer(self):
|
||||
distributed_args = f"""--nproc_per_node={torch.xpu.device_count()}
|
||||
--master_port={get_torch_dist_unique_port()}
|
||||
{self.test_file_dir}/test_trainer_distributed.py
|
||||
""".split()
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"--output_dir {output_dir}".split()
|
||||
cmd = ["torchrun"] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
|
||||
#
|
||||
|
||||
@@ -17,12 +17,15 @@ from typing import Dict
|
||||
from transformers import is_torch_available
|
||||
from transformers.testing_utils import (
|
||||
TestCasePlus,
|
||||
backend_device_count,
|
||||
execute_subprocess_async,
|
||||
get_torch_dist_unique_port,
|
||||
require_accelerate,
|
||||
require_fp8,
|
||||
require_fsdp,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_multi_accelerator,
|
||||
run_first,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
@@ -64,9 +67,10 @@ if is_torch_available():
|
||||
|
||||
|
||||
class TestFSDPTrainer(TestCasePlus):
|
||||
@require_torch_multi_accelerator
|
||||
@require_accelerate
|
||||
@require_torch_multi_gpu
|
||||
@require_fsdp
|
||||
@run_first
|
||||
def test_trainer(self):
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
cmd = [
|
||||
@@ -76,7 +80,7 @@ class TestFSDPTrainer(TestCasePlus):
|
||||
"--main_process_port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
"--num_processes",
|
||||
f"{torch.cuda.device_count()}",
|
||||
f"{backend_device_count(torch_device)}",
|
||||
"--fsdp_transformer_layer_cls_to_wrap",
|
||||
"GPT2Block",
|
||||
f"{self.test_file_dir}/test_trainer_fsdp.py",
|
||||
@@ -90,10 +94,11 @@ class TestFSDPTrainer(TestCasePlus):
|
||||
|
||||
|
||||
class TestFSDPTrainerFP8(TestCasePlus):
|
||||
@require_torch_multi_accelerator
|
||||
@require_accelerate
|
||||
@require_torch_multi_gpu
|
||||
@require_fsdp
|
||||
@require_fp8
|
||||
@run_first
|
||||
def test_trainer(self):
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
cmd = [
|
||||
@@ -103,7 +108,7 @@ class TestFSDPTrainerFP8(TestCasePlus):
|
||||
"--main_process_port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
"--num_processes",
|
||||
f"{torch.cuda.device_count()}",
|
||||
f"{backend_device_count(torch_device)}",
|
||||
"--mixed_precision",
|
||||
"fp8",
|
||||
"--fsdp_transformer_layer_cls_to_wrap",
|
||||
@@ -117,32 +122,34 @@ class TestFSDPTrainerFP8(TestCasePlus):
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
class TestFSDPTrainerWrap(TestCasePlus):
|
||||
@require_accelerate
|
||||
@require_torch_multi_gpu
|
||||
@require_fsdp
|
||||
def test_trainer(self):
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
cmd = [
|
||||
"accelerate",
|
||||
"launch",
|
||||
"--use_fsdp",
|
||||
"--main_process_port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
"--num_processes",
|
||||
f"{torch.cuda.device_count()}",
|
||||
"--fsdp_transformer_layer_cls_to_wrap",
|
||||
"GPT2Block",
|
||||
f"{self.test_file_dir}/test_trainer_fsdp.py",
|
||||
"--output_dir",
|
||||
f"{output_dir}",
|
||||
"--report_to",
|
||||
"none",
|
||||
"--auto_find_batch_size",
|
||||
"True",
|
||||
]
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
class TestFSDPTrainerWrap(TestCasePlus):
|
||||
@require_torch_multi_accelerator
|
||||
@require_accelerate
|
||||
@require_fsdp
|
||||
@run_first
|
||||
def test_trainer(self):
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
cmd = [
|
||||
"accelerate",
|
||||
"launch",
|
||||
"--use_fsdp",
|
||||
"--main_process_port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
"--num_processes",
|
||||
f"{backend_device_count(torch_device)}",
|
||||
"--fsdp_transformer_layer_cls_to_wrap",
|
||||
"GPT2Block",
|
||||
f"{self.test_file_dir}/test_trainer_fsdp.py",
|
||||
"--output_dir",
|
||||
f"{output_dir}",
|
||||
"--report_to",
|
||||
"none",
|
||||
"--auto_find_batch_size",
|
||||
"True",
|
||||
]
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
# successful return here == success - any errors would have caused an error in the sub-call
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user