AdamW is now supported by default (#9624)

This commit is contained in:
Stas Bekman
2021-03-12 13:40:07 -08:00
committed by GitHub
parent fa35cda91e
commit 4c32f9f26e
3 changed files with 9 additions and 12 deletions

View File

@@ -655,7 +655,6 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
"weight_decay": 3e-7 "weight_decay": 3e-7
} }
}, },
"zero_allow_untested_optimizer": true,
"scheduler": { "scheduler": {
"type": "WarmupLR", "type": "WarmupLR",
@@ -766,8 +765,8 @@ Optimizer
======================================================================================================================= =======================================================================================================================
DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__. <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
@@ -779,7 +778,6 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
.. code-block:: json .. code-block:: json
{ {
"zero_allow_untested_optimizer": true,
"optimizer": { "optimizer": {
"type": "AdamW", "type": "AdamW",
"params": { "params": {
@@ -791,8 +789,8 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
} }
} }
Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
``zero_allow_untested_optimizer`` flag. true`` to the top level configuration.
If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
make sure to adjust the values. e.g. if you use Adam you will want ``weight_decay`` around ``0.01``. make sure to adjust the values. e.g. if you use Adam you will want ``weight_decay`` around ``0.01``.

View File

@@ -19,8 +19,6 @@
"cpu_offload": true "cpu_offload": true
}, },
"zero_allow_untested_optimizer": true,
"optimizer": { "optimizer": {
"type": "AdamW", "type": "AdamW",
"params": { "params": {

View File

@@ -26,6 +26,7 @@ from types import SimpleNamespace
from .trainer_utils import SchedulerType from .trainer_utils import SchedulerType
from .utils import logging from .utils import logging
from .utils.versions import require_version
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
@@ -281,6 +282,8 @@ def init_deepspeed(trainer, num_training_steps):
""" """
import deepspeed import deepspeed
require_version("deepspeed>0.3.10")
args = trainer.args args = trainer.args
ds_config_file = args.deepspeed ds_config_file = args.deepspeed
model = trainer.model model = trainer.model
@@ -323,9 +326,8 @@ def init_deepspeed(trainer, num_training_steps):
f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args" f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args"
) )
else: # override only if the ds config doesn't already have this section else: # override only if the ds config doesn't already have this section
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. # ds supports Adam, AdamW, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
# But trainer uses AdamW by default. # Using other optimizers requires voiding warranty with: `"zero_allow_untested_optimizer": true`
# To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer`
optimizer_configs = { optimizer_configs = {
"AdamW": { "AdamW": {
@@ -337,7 +339,6 @@ def init_deepspeed(trainer, num_training_steps):
} }
optimizer = "AdamW" optimizer = "AdamW"
config["zero_allow_untested_optimizer"] = True
config["optimizer"] = { config["optimizer"] = {
"type": optimizer, "type": optimizer,
"params": optimizer_configs[optimizer], "params": optimizer_configs[optimizer],