diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b409a84bed..b6e35b73b9 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -493,7 +493,7 @@ class TrainingArguments: - `"tpu_metrics_debug"`: print debug metrics on TPU The options should be separated by whitespaces. - optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`): + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor. optim_args (`str`, *optional*): @@ -1034,12 +1034,12 @@ class TrainingArguments: default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} ) - default_optim = "adamw_hf" + default_optim = "adamw_torch" # XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out # if is_torch_available() and version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.1.0"): # default_optim = "adamw_torch_fused" # and update the doc above to: - # optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_hf"`): + # optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_torch"`)): optim: Union[OptimizerNames, str] = field( default=default_optim, metadata={"help": "The optimizer to use."}, @@ -2421,7 +2421,7 @@ class TrainingArguments: def set_optimizer( self, - name: Union[str, OptimizerNames] = "adamw_hf", + name: Union[str, OptimizerNames] = "adamw_torch", learning_rate: float = 5e-5, weight_decay: float = 0, beta1: float = 0.9, @@ -2433,7 +2433,7 @@ class TrainingArguments: A method that regroups all arguments linked to the optimizer and its hyperparameters. 
Args: - name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`): + name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`, `"adamw_anyprecision"` or `"adafactor"`. learning_rate (`float`, *optional*, defaults to 5e-5):