From a1c4954d25ca030c85319dd78395a4eff816e852 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 27 Jul 2023 09:11:28 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=F0=9F=9A=A8Change=20de?=
 =?UTF-8?q?fault=20from=20`adamw=5Fhf`=20to=20`adamw=5Ftorch`=20?=
 =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=F0=9F=9A=A8=20(#25109)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Change defaults

* Sylvain's comments
---
 src/transformers/training_args.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index b409a84bed..b6e35b73b9 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -493,7 +493,7 @@ class TrainingArguments:
             - `"tpu_metrics_debug"`: print debug metrics on TPU
 
             The options should be separated by whitespaces.
-        optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`):
+        optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
             The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or
             adafactor.
         optim_args (`str`, *optional*):
@@ -1034,12 +1034,12 @@ class TrainingArguments:
         default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
     )
 
-    default_optim = "adamw_hf"
+    default_optim = "adamw_torch"
     # XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out
     # if is_torch_available() and version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.1.0"):
     #     default_optim = "adamw_torch_fused"
     # and update the doc above to:
-    # optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_hf"`):
+    # optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_torch"`):
     optim: Union[OptimizerNames, str] = field(
         default=default_optim,
         metadata={"help": "The optimizer to use."},
@@ -2421,7 +2421,7 @@ class TrainingArguments:
 
     def set_optimizer(
         self,
-        name: Union[str, OptimizerNames] = "adamw_hf",
+        name: Union[str, OptimizerNames] = "adamw_torch",
         learning_rate: float = 5e-5,
         weight_decay: float = 0,
         beta1: float = 0.9,
@@ -2433,7 +2433,7 @@ class TrainingArguments:
         A method that regroups all arguments linked to the optimizer and its hyperparameters.
 
         Args:
-            name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`):
+            name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
                 The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
                 `"adamw_anyprecision"` or `"adafactor"`.
             learning_rate (`float`, *optional*, defaults to 5e-5):