From 538245b0c2199a7546bb92eed566965609acb5bb Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Wed, 20 Jan 2021 22:29:31 +0530 Subject: [PATCH] Fix Trainer and Args to mention AdamW, not Adam. (#9685) * Fix Trainer and Args to mention AdamW, not Adam. * Update the docs for Training Arguments. * Change arguments adamw_* to adam_* * Fixed links to AdamW in TrainingArguments docs * Fix line length in Training Args docs. --- src/transformers/training_args.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index abef9e35c6..c9d436ebc0 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -104,15 +104,16 @@ class TrainingArguments: left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but requires more memory). learning_rate (:obj:`float`, `optional`, defaults to 5e-5): - The initial learning rate for Adam. + The initial learning rate for :class:`~transformers.AdamW` optimizer. weight_decay (:obj:`float`, `optional`, defaults to 0): - The weight decay to apply (if not zero). + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in + :class:`~transformers.AdamW` optimizer. adam_beta1 (:obj:`float`, `optional`, defaults to 0.9): - The beta1 hyperparameter for the Adam optimizer. + The beta1 hyperparameter for the :class:`~transformers.AdamW` optimizer. adam_beta2 (:obj:`float`, `optional`, defaults to 0.999): - The beta2 hyperparameter for the Adam optimizer. + The beta2 hyperparameter for the :class:`~transformers.AdamW` optimizer. adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8): - The epsilon hyperparameter for the Adam optimizer. + The epsilon hyperparameter for the :class:`~transformers.AdamW` optimizer. max_grad_norm (:obj:`float`, `optional`, defaults to 1.0): Maximum gradient norm (for gradient clipping). 
num_train_epochs(:obj:`float`, `optional`, defaults to 3.0): @@ -288,11 +289,11 @@ class TrainingArguments: metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."}, ) - learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."}) - weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."}) - adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer"}) - adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer"}) - adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."}) + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) @@ -407,7 +408,7 @@ class TrainingArguments: label_smoothing_factor: float = field( default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} ) - adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace Adam by Adafactor."}) + adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."}) group_by_length: bool = field( default=False, metadata={"help": "Whether or not to group samples of roughly the same length together when batching."},