Just import torch AdamW instead (#36177)

* Just import torch AdamW instead

* Update docs too

* Make AdamW undocumented

* make fixup

* Add a basic wrapper class

* Add it back to the docs

* Just remove AdamW entirely

* Remove some AdamW references

* Drop AdamW from the public init

* make fix-copies

* Cleanup some references

* make fixup

* Delete lots of transformers.AdamW references

* Remove extra references to adamw_hf
This commit is contained in:
Matt
2025-03-19 18:29:40 +00:00
committed by GitHub
parent 51bd0ceb9e
commit 9be4728af8
18 changed files with 18 additions and 174 deletions

View File

@@ -8,7 +8,6 @@ import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
from transformers import (
AdamW,
AutoConfig,
AutoModel,
AutoModelForPreTraining,
@@ -20,6 +19,7 @@ from transformers import (
AutoTokenizer,
PretrainedConfig,
PreTrainedTokenizer,
is_torch_available,
)
from transformers.optimization import (
Adafactor,
@@ -31,6 +31,10 @@ from transformers.optimization import (
from transformers.utils.versions import require_version
if is_torch_available():
import torch
logger = logging.getLogger(__name__)
require_version("pytorch_lightning>=1.0.4")
@@ -146,7 +150,7 @@ class BaseTransformer(pl.LightningModule):
)
else:
optimizer = AdamW(
optimizer = torch.optim.AdamW(
optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
)
self.opt = optimizer

View File

@@ -32,7 +32,6 @@ import transformers
from transformers import (
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
WEIGHTS_NAME,
AdamW,
AutoConfig,
AutoModelForQuestionAnswering,
AutoTokenizer,
@@ -96,7 +95,7 @@ def train(args, train_dataset, model, tokenizer):
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)

View File

@@ -43,7 +43,6 @@ from tqdm import tqdm, trange
from transformers import (
CONFIG_NAME,
WEIGHTS_NAME,
AdamW,
OpenAIGPTDoubleHeadsModel,
OpenAIGPTTokenizer,
get_linear_schedule_with_warmup,
@@ -236,7 +235,7 @@ def main():
},
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)

View File

@@ -34,7 +34,6 @@ from tqdm import tqdm, trange
import transformers
from transformers import (
WEIGHTS_NAME,
AdamW,
AutoConfig,
AutoModelForMultipleChoice,
AutoTokenizer,
@@ -298,7 +297,7 @@ def train(args, train_dataset, model, tokenizer):
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)

View File

@@ -22,7 +22,6 @@ from transformers import PreTrainedModel, Trainer, logging
from transformers.models.fsmt.configuration_fsmt import FSMTConfig
from transformers.optimization import (
Adafactor,
AdamW,
get_constant_schedule,
get_constant_schedule_with_warmup,
get_cosine_schedule_with_warmup,
@@ -102,12 +101,11 @@ class Seq2SeqTrainer(Trainer):
"weight_decay": 0.0,
},
]
optimizer_cls = Adafactor if self.args.adafactor else AdamW
if self.args.adafactor:
optimizer_cls = Adafactor
optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
else:
optimizer_cls = AdamW
optimizer_cls = torch.optim.AdamW
optimizer_kwargs = {
"betas": (self.args.adam_beta1, self.args.adam_beta2),
"eps": self.args.adam_epsilon,

View File

@@ -41,7 +41,6 @@ from utils_qa import postprocess_qa_predictions_with_beam_search
import transformers
from transformers import (
AdamW,
DataCollatorWithPadding,
EvalPrediction,
SchedulerType,
@@ -767,7 +766,7 @@ def main():
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Scheduler and math around the number of training steps.
overrode_max_train_steps = False

View File

@@ -33,7 +33,6 @@ from tqdm.auto import tqdm
import transformers
from transformers import (
AdamW,
SchedulerType,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
@@ -583,7 +582,7 @@ def main():
)
# Optimizer
optimizer = AdamW(
optimizer = torch.optim.AdamW(
list(model.parameters()),
lr=args.learning_rate,
betas=[args.adam_beta1, args.adam_beta2],