Revert PR 32299, flag users when Zero-3 was missed (#32851)
Revert PR 32299
This commit is contained in:
@@ -1478,9 +1478,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
else:
|
else:
|
||||||
model = cls(config, **kwargs)
|
model = cls(config, **kwargs)
|
||||||
|
|
||||||
# If we init with `zero3`, add a flag attr to the model so we can check downstream for issues
|
|
||||||
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled()
|
|
||||||
|
|
||||||
# restore default dtype if it was modified
|
# restore default dtype if it was modified
|
||||||
if dtype_orig is not None:
|
if dtype_orig is not None:
|
||||||
torch.set_default_dtype(dtype_orig)
|
torch.set_default_dtype(dtype_orig)
|
||||||
@@ -3810,9 +3807,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
# Let's make sure we don't run the init function of buffer modules
|
# Let's make sure we don't run the init function of buffer modules
|
||||||
model = cls(config, *model_args, **model_kwargs)
|
model = cls(config, *model_args, **model_kwargs)
|
||||||
|
|
||||||
# If we init with `zero3`, add an attr to the model so we can check downstream for issues
|
|
||||||
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled() and not is_quantized
|
|
||||||
|
|
||||||
# make sure we use the model's config since the __init__ call might have copied it
|
# make sure we use the model's config since the __init__ call might have copied it
|
||||||
config = model.config
|
config = model.config
|
||||||
|
|
||||||
|
|||||||
@@ -100,7 +100,6 @@ from .trainer_pt_utils import (
|
|||||||
get_model_param_count,
|
get_model_param_count,
|
||||||
get_module_class_from_name,
|
get_module_class_from_name,
|
||||||
get_parameter_names,
|
get_parameter_names,
|
||||||
is_deepspeed_zero3_enabled,
|
|
||||||
nested_concat,
|
nested_concat,
|
||||||
nested_detach,
|
nested_detach,
|
||||||
nested_numpify,
|
nested_numpify,
|
||||||
@@ -435,15 +434,6 @@ class Trainer:
|
|||||||
)
|
)
|
||||||
self.model_init = model_init
|
self.model_init = model_init
|
||||||
|
|
||||||
# Will reach this branch if the user has
|
|
||||||
# 1. Used `.from_pretrained` or `.from_config` to initialize their model
|
|
||||||
# 2. Did not configure Zero-3 via `TrainingArguments` or `accelerate launch` beforehand
|
|
||||||
# New models init such as `MyModel()` will not hit this step
|
|
||||||
if is_deepspeed_zero3_enabled() and not getattr(model, "_transformers_zero3_init_used", True):
|
|
||||||
raise ValueError(
|
|
||||||
"Model was not initialized with `Zero-3` despite being configured for DeepSpeed Zero-3. Please re-initialize your model via `Model.from_pretrained(...)` or `Model.from_config(...)` after creating your `TrainingArguments`!"
|
|
||||||
)
|
|
||||||
|
|
||||||
if model.__class__.__name__ in MODEL_MAPPING_NAMES:
|
if model.__class__.__name__ in MODEL_MAPPING_NAMES:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
|
f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
|
||||||
|
|||||||
@@ -709,30 +709,34 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
|||||||
# Relative difference. See the note above how to get identical loss on a small bs
|
# Relative difference. See the note above how to get identical loss on a small bs
|
||||||
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
|
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
|
||||||
|
|
||||||
def test_missed_zero3_init(self):
|
# NOTE: Currently a disabled test. In the future we should re-enable it.
|
||||||
from transformers import Trainer # noqa
|
# Issue revolves around Zero-3 w/ DPO/TRL + DeepSpeed
|
||||||
|
# As well as Zero-3 inference
|
||||||
|
# Related PR: https://github.com/huggingface/transformers/pull/32299
|
||||||
|
# def test_missed_zero3_init(self):
|
||||||
|
# from transformers import Trainer # noqa
|
||||||
|
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
# with mockenv_context(**self.dist_env_1_gpu):
|
||||||
model = AutoModel.from_pretrained(T5_TINY)
|
# model = AutoModel.from_pretrained(T5_TINY)
|
||||||
training_args = TrainingArguments(
|
# training_args = TrainingArguments(
|
||||||
output_dir="./test_missed_zero3_init",
|
# output_dir="./test_missed_zero3_init",
|
||||||
deepspeed=self.get_config_dict(ZERO3),
|
# deepspeed=self.get_config_dict(ZERO3),
|
||||||
)
|
# )
|
||||||
with self.assertRaises(
|
# with self.assertRaises(
|
||||||
ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
|
# ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
|
||||||
):
|
# ):
|
||||||
_ = Trainer(
|
# _ = Trainer(
|
||||||
model=model,
|
# model=model,
|
||||||
args=training_args,
|
# args=training_args,
|
||||||
)
|
# )
|
||||||
# Now do it properly, triggered from our `TrainingArguments` earlier
|
# # Now do it properly, triggered from our `TrainingArguments` earlier
|
||||||
model = AutoModel.from_pretrained(T5_TINY)
|
# model = AutoModel.from_pretrained(T5_TINY)
|
||||||
trainer = Trainer(
|
# trainer = Trainer(
|
||||||
model=model,
|
# model=model,
|
||||||
args=training_args,
|
# args=training_args,
|
||||||
)
|
# )
|
||||||
assert trainer.is_deepspeed_enabled
|
# assert trainer.is_deepspeed_enabled
|
||||||
assert model._transformers_zero3_init_used
|
# assert model._transformers_zero3_init_used
|
||||||
|
|
||||||
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
|
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
|
||||||
# adapted from TrainerIntegrationCommon.check_saved_checkpoints
|
# adapted from TrainerIntegrationCommon.check_saved_checkpoints
|
||||||
|
|||||||
Reference in New Issue
Block a user