🚨🚨🚨 Change default from adamw_hf to adamw_torch 🚨🚨🚨 (#25109)
* Change defaults
* Sylvain's comments
This commit is contained in:
@@ -493,7 +493,7 @@ class TrainingArguments:
|
|||||||
- `"tpu_metrics_debug"`: print debug metrics on TPU
|
- `"tpu_metrics_debug"`: print debug metrics on TPU
|
||||||
|
|
||||||
The options should be separated by whitespaces.
|
The options should be separated by whitespaces.
|
||||||
optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`):
|
optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
|
||||||
The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or
|
The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or
|
||||||
adafactor.
|
adafactor.
|
||||||
optim_args (`str`, *optional*):
|
optim_args (`str`, *optional*):
|
||||||
@@ -1034,12 +1034,12 @@ class TrainingArguments:
|
|||||||
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
|
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
|
||||||
)
|
)
|
||||||
|
|
||||||
default_optim = "adamw_hf"
|
default_optim = "adamw_torch"
|
||||||
# XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out
|
# XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out
|
||||||
# if is_torch_available() and version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.1.0"):
|
# if is_torch_available() and version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.1.0"):
|
||||||
# default_optim = "adamw_torch_fused"
|
# default_optim = "adamw_torch_fused"
|
||||||
# and update the doc above to:
|
# and update the doc above to:
|
||||||
# optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_hf"`):
|
# optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_torch"`)):
|
||||||
optim: Union[OptimizerNames, str] = field(
|
optim: Union[OptimizerNames, str] = field(
|
||||||
default=default_optim,
|
default=default_optim,
|
||||||
metadata={"help": "The optimizer to use."},
|
metadata={"help": "The optimizer to use."},
|
||||||
@@ -2421,7 +2421,7 @@ class TrainingArguments:
|
|||||||
|
|
||||||
def set_optimizer(
|
def set_optimizer(
|
||||||
self,
|
self,
|
||||||
name: Union[str, OptimizerNames] = "adamw_hf",
|
name: Union[str, OptimizerNames] = "adamw_torch",
|
||||||
learning_rate: float = 5e-5,
|
learning_rate: float = 5e-5,
|
||||||
weight_decay: float = 0,
|
weight_decay: float = 0,
|
||||||
beta1: float = 0.9,
|
beta1: float = 0.9,
|
||||||
@@ -2433,7 +2433,7 @@ class TrainingArguments:
|
|||||||
A method that regroups all arguments linked to the optimizer and its hyperparameters.
|
A method that regroups all arguments linked to the optimizer and its hyperparameters.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`):
|
name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
|
||||||
The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
|
The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
|
||||||
`"adamw_anyprecision"` or `"adafactor"`.
|
`"adamw_anyprecision"` or `"adafactor"`.
|
||||||
learning_rate (`float`, *optional*, defaults to 5e-5):
|
learning_rate (`float`, *optional*, defaults to 5e-5):
|
||||||
|
|||||||
Reference in New Issue
Block a user