AdamW is now supported by default (#9624)
This commit is contained in:
@@ -655,7 +655,6 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
|
|||||||
"weight_decay": 3e-7
|
"weight_decay": 3e-7
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"zero_allow_untested_optimizer": true,
|
|
||||||
|
|
||||||
"scheduler": {
|
"scheduler": {
|
||||||
"type": "WarmupLR",
|
"type": "WarmupLR",
|
||||||
@@ -766,8 +765,8 @@ Optimizer
|
|||||||
=======================================================================================================================
|
=======================================================================================================================
|
||||||
|
|
||||||
|
|
||||||
DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
|
DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
|
||||||
recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
|
thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
|
||||||
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
|
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
|
||||||
|
|
||||||
If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
|
If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
|
||||||
@@ -779,7 +778,6 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
|
|||||||
.. code-block:: json
|
.. code-block:: json
|
||||||
|
|
||||||
{
|
{
|
||||||
"zero_allow_untested_optimizer": true,
|
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "AdamW",
|
"type": "AdamW",
|
||||||
"params": {
|
"params": {
|
||||||
@@ -791,8 +789,8 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
|
If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
|
||||||
``zero_allow_untested_optimizer`` flag.
|
true`` to the top-level configuration.
|
||||||
|
|
||||||
If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
|
If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
|
||||||
make sure to adjust the values, e.g. if you use Adam you will want ``weight_decay`` around ``0.01``.
|
make sure to adjust the values, e.g. if you use Adam you will want ``weight_decay`` around ``0.01``.
|
||||||
|
|||||||
@@ -19,8 +19,6 @@
|
|||||||
"cpu_offload": true
|
"cpu_offload": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"zero_allow_untested_optimizer": true,
|
|
||||||
|
|
||||||
"optimizer": {
|
"optimizer": {
|
||||||
"type": "AdamW",
|
"type": "AdamW",
|
||||||
"params": {
|
"params": {
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ from types import SimpleNamespace
|
|||||||
|
|
||||||
from .trainer_utils import SchedulerType
|
from .trainer_utils import SchedulerType
|
||||||
from .utils import logging
|
from .utils import logging
|
||||||
|
from .utils.versions import require_version
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
@@ -281,6 +282,8 @@ def init_deepspeed(trainer, num_training_steps):
|
|||||||
"""
|
"""
|
||||||
import deepspeed
|
import deepspeed
|
||||||
|
|
||||||
|
require_version("deepspeed>0.3.10")
|
||||||
|
|
||||||
args = trainer.args
|
args = trainer.args
|
||||||
ds_config_file = args.deepspeed
|
ds_config_file = args.deepspeed
|
||||||
model = trainer.model
|
model = trainer.model
|
||||||
@@ -323,9 +326,8 @@ def init_deepspeed(trainer, num_training_steps):
|
|||||||
f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args"
|
f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args"
|
||||||
)
|
)
|
||||||
else: # override only if the ds config doesn't already have this section
|
else: # override only if the ds config doesn't already have this section
|
||||||
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
|
# ds supports Adam, AdamW, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
|
||||||
# But trainer uses AdamW by default.
|
# Using other optimizers requires voiding warranty with: `"zero_allow_untested_optimizer": true`
|
||||||
# To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer`
|
|
||||||
|
|
||||||
optimizer_configs = {
|
optimizer_configs = {
|
||||||
"AdamW": {
|
"AdamW": {
|
||||||
@@ -337,7 +339,6 @@ def init_deepspeed(trainer, num_training_steps):
|
|||||||
}
|
}
|
||||||
optimizer = "AdamW"
|
optimizer = "AdamW"
|
||||||
|
|
||||||
config["zero_allow_untested_optimizer"] = True
|
|
||||||
config["optimizer"] = {
|
config["optimizer"] = {
|
||||||
"type": optimizer,
|
"type": optimizer,
|
||||||
"params": optimizer_configs[optimizer],
|
"params": optimizer_configs[optimizer],
|
||||||
|
|||||||
Reference in New Issue
Block a user