Resizing embedding matrix before sending it to the optimizer. (#3532)
* Resizing the embedding matrix after sending it to the optimizer prevents the optimizer from updating the newly resized matrix. * Remove a space for style reasons.
This commit is contained in:
@@ -233,6 +233,9 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedToke
|
|||||||
else:
|
else:
|
||||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||||
|
|
||||||
|
model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
||||||
|
model.resize_token_embeddings(len(tokenizer))
|
||||||
|
|
||||||
# Prepare optimizer and schedule (linear warmup and decay)
|
# Prepare optimizer and schedule (linear warmup and decay)
|
||||||
no_decay = ["bias", "LayerNorm.weight"]
|
no_decay = ["bias", "LayerNorm.weight"]
|
||||||
optimizer_grouped_parameters = [
|
optimizer_grouped_parameters = [
|
||||||
@@ -309,9 +312,6 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedToke
|
|||||||
|
|
||||||
tr_loss, logging_loss = 0.0, 0.0
|
tr_loss, logging_loss = 0.0, 0.0
|
||||||
|
|
||||||
model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
|
||||||
model_to_resize.resize_token_embeddings(len(tokenizer))
|
|
||||||
|
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
train_iterator = trange(
|
train_iterator = trange(
|
||||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||||
|
|||||||
Reference in New Issue
Block a user