Just import torch AdamW instead (#36177)
* Just import torch AdamW instead * Update docs too * Make AdamW undocumented * make fixup * Add a basic wrapper class * Add it back to the docs * Just remove AdamW entirely * Remove some AdamW references * Drop AdamW from the public init * make fix-copies * Cleanup some references * make fixup * Delete lots of transformers.AdamW references * Remove extra references to adamw_hf
This commit is contained in:
@@ -8,7 +8,6 @@ import pytorch_lightning as pl
|
||||
from pytorch_lightning.utilities import rank_zero_info
|
||||
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForPreTraining,
|
||||
@@ -20,6 +19,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
PretrainedConfig,
|
||||
PreTrainedTokenizer,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.optimization import (
|
||||
Adafactor,
|
||||
@@ -31,6 +31,10 @@ from transformers.optimization import (
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
require_version("pytorch_lightning>=1.0.4")
|
||||
@@ -146,7 +150,7 @@ class BaseTransformer(pl.LightningModule):
|
||||
)
|
||||
|
||||
else:
|
||||
optimizer = AdamW(
|
||||
optimizer = torch.optim.AdamW(
|
||||
optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
|
||||
)
|
||||
self.opt = optimizer
|
||||
|
||||
@@ -32,7 +32,6 @@ import transformers
|
||||
from transformers import (
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoTokenizer,
|
||||
@@ -96,7 +95,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
@@ -43,7 +43,6 @@ from tqdm import tqdm, trange
|
||||
from transformers import (
|
||||
CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
OpenAIGPTDoubleHeadsModel,
|
||||
OpenAIGPTTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
@@ -236,7 +235,7 @@ def main():
|
||||
},
|
||||
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
@@ -34,7 +34,6 @@ from tqdm import tqdm, trange
|
||||
import transformers
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
@@ -298,7 +297,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
@@ -22,7 +22,6 @@ from transformers import PreTrainedModel, Trainer, logging
|
||||
from transformers.models.fsmt.configuration_fsmt import FSMTConfig
|
||||
from transformers.optimization import (
|
||||
Adafactor,
|
||||
AdamW,
|
||||
get_constant_schedule,
|
||||
get_constant_schedule_with_warmup,
|
||||
get_cosine_schedule_with_warmup,
|
||||
@@ -102,12 +101,11 @@ class Seq2SeqTrainer(Trainer):
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer_cls = Adafactor if self.args.adafactor else AdamW
|
||||
if self.args.adafactor:
|
||||
optimizer_cls = Adafactor
|
||||
optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
|
||||
else:
|
||||
optimizer_cls = AdamW
|
||||
optimizer_cls = torch.optim.AdamW
|
||||
optimizer_kwargs = {
|
||||
"betas": (self.args.adam_beta1, self.args.adam_beta2),
|
||||
"eps": self.args.adam_epsilon,
|
||||
|
||||
@@ -41,7 +41,6 @@ from utils_qa import postprocess_qa_predictions_with_beam_search
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AdamW,
|
||||
DataCollatorWithPadding,
|
||||
EvalPrediction,
|
||||
SchedulerType,
|
||||
@@ -767,7 +766,7 @@ def main():
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
overrode_max_train_steps = False
|
||||
|
||||
@@ -33,7 +33,6 @@ from tqdm.auto import tqdm
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AdamW,
|
||||
SchedulerType,
|
||||
Wav2Vec2Config,
|
||||
Wav2Vec2FeatureExtractor,
|
||||
@@ -583,7 +582,7 @@ def main():
|
||||
)
|
||||
|
||||
# Optimizer
|
||||
optimizer = AdamW(
|
||||
optimizer = torch.optim.AdamW(
|
||||
list(model.parameters()),
|
||||
lr=args.learning_rate,
|
||||
betas=[args.adam_beta1, args.adam_beta2],
|
||||
|
||||
Reference in New Issue
Block a user