Merge pull request #1832 from huggingface/memory-leak-schedulers
replace LambdaLR scheduler wrappers by function
This commit is contained in:
@@ -520,12 +520,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
|
|||||||
# Parameters:
|
# Parameters:
|
||||||
lr = 1e-3
|
lr = 1e-3
|
||||||
max_grad_norm = 1.0
|
max_grad_norm = 1.0
|
||||||
num_total_steps = 1000
|
num_training_steps = 1000
|
||||||
num_warmup_steps = 100
|
num_warmup_steps = 100
|
||||||
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
|
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||||
|
|
||||||
### Previously BertAdam optimizer was instantiated like this:
|
### Previously BertAdam optimizer was instantiated like this:
|
||||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
|
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
@@ -534,7 +534,7 @@ for batch in train_data:
|
|||||||
|
|
||||||
### In Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
model.train()
|
model.train()
|
||||||
|
|||||||
@@ -18,19 +18,17 @@ Schedules
|
|||||||
Learning Rate Schedules
|
Learning Rate Schedules
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
.. autoclass:: transformers.ConstantLRSchedule
|
.. autofunction:: transformers.get_constant_schedule
|
||||||
:members:
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupConstantSchedule
|
.. autofunction:: transformers.get_constant_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_constant_schedule.png
|
.. image:: /imgs/warmup_constant_schedule.png
|
||||||
:target: /imgs/warmup_constant_schedule.png
|
:target: /imgs/warmup_constant_schedule.png
|
||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupCosineSchedule
|
.. autofunction:: transformers.get_cosine_schedule_with_warmup
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_schedule.png
|
.. image:: /imgs/warmup_cosine_schedule.png
|
||||||
@@ -38,8 +36,7 @@ Learning Rate Schedules
|
|||||||
:alt:
|
:alt:
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
|
.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
|
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
|
||||||
@@ -47,8 +44,7 @@ Learning Rate Schedules
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. autoclass:: transformers.WarmupLinearSchedule
|
.. autofunction:: transformers.get_linear_schedule_with_warmup
|
||||||
:members:
|
|
||||||
|
|
||||||
.. image:: /imgs/warmup_linear_schedule.png
|
.. image:: /imgs/warmup_linear_schedule.png
|
||||||
:target: /imgs/warmup_linear_schedule.png
|
:target: /imgs/warmup_linear_schedule.png
|
||||||
|
|||||||
@@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
|
|||||||
# Parameters:
|
# Parameters:
|
||||||
lr = 1e-3
|
lr = 1e-3
|
||||||
max_grad_norm = 1.0
|
max_grad_norm = 1.0
|
||||||
num_total_steps = 1000
|
num_training_steps = 1000
|
||||||
num_warmup_steps = 100
|
num_warmup_steps = 100
|
||||||
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
|
warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
|
||||||
|
|
||||||
### Previously BertAdam optimizer was instantiated like this:
|
### Previously BertAdam optimizer was instantiated like this:
|
||||||
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
|
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
@@ -98,7 +98,7 @@ for batch in train_data:
|
|||||||
|
|
||||||
### In Transformers, optimizer and schedules are split and instantiated like this:
|
### In Transformers, optimizer and schedules are split and instantiated like this:
|
||||||
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
|
||||||
### and used like this:
|
### and used like this:
|
||||||
for batch in train_data:
|
for batch in train_data:
|
||||||
loss = model(batch)
|
loss = model(batch)
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
|||||||
|
|
||||||
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||||
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
||||||
WarmupLinearSchedule)
|
get_linear_schedule_with_warmup)
|
||||||
|
|
||||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||||
|
|
||||||
@@ -211,7 +211,7 @@ def main():
|
|||||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
|
|
||||||
if args.do_train:
|
if args.do_train:
|
||||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ from tqdm import tqdm, trange
|
|||||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||||
BertForMultipleChoice, BertTokenizer)
|
BertForMultipleChoice, BertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -322,7 +322,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ try:
|
|||||||
except:
|
except:
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
|
|
||||||
from transformers import WarmupLinearSchedule
|
from transformers import get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils import logger
|
from utils import logger
|
||||||
from lm_seqs_dataset import LmSeqsDataset
|
from lm_seqs_dataset import LmSeqsDataset
|
||||||
@@ -137,9 +137,9 @@ class Distiller:
|
|||||||
betas=(0.9, 0.98))
|
betas=(0.9, 0.98))
|
||||||
|
|
||||||
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
||||||
self.scheduler = WarmupLinearSchedule(self.optimizer,
|
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
|
||||||
warmup_steps=warmup_steps,
|
num_warmup_steps=warmup_steps,
|
||||||
t_total=num_train_optimization_steps)
|
num_training_steps=num_train_optimization_steps)
|
||||||
|
|
||||||
if self.fp16:
|
if self.fp16:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from ..utils_squad import (read_squad_examples, convert_examples_to_features,
|
from ..utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
DistilBertForSequenceClassification,
|
DistilBertForSequenceClassification,
|
||||||
DistilBertTokenizer)
|
DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from transformers import glue_compute_metrics as compute_metrics
|
from transformers import glue_compute_metrics as compute_metrics
|
||||||
from transformers import glue_output_modes as output_modes
|
from transformers import glue_output_modes as output_modes
|
||||||
@@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ except:
|
|||||||
|
|
||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
|
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
|
||||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||||
@@ -185,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer, RobertaConfig,
|
XLNetTokenizer, RobertaConfig,
|
||||||
RobertaForMultipleChoice, RobertaTokenizer)
|
RobertaForMultipleChoice, RobertaTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils_multiple_choice import (convert_examples_to_features, processors)
|
from utils_multiple_choice import (convert_examples_to_features, processors)
|
||||||
|
|
||||||
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from torch.utils.data.distributed import DistributedSampler
|
|||||||
from tqdm import tqdm, trange
|
from tqdm import tqdm, trange
|
||||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
|
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
|
||||||
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
|
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
|
||||||
|
|
||||||
@@ -80,7 +80,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
|||||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
|
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
|
|||||||
XLNetTokenizer,
|
XLNetTokenizer,
|
||||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
|
||||||
|
|
||||||
from transformers import AdamW, WarmupLinearSchedule
|
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||||
|
|
||||||
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
from utils_squad import (read_squad_examples, convert_examples_to_features,
|
||||||
RawResult, write_predictions,
|
RawResult, write_predictions,
|
||||||
@@ -98,7 +98,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
|
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
try:
|
try:
|
||||||
from apex import amp
|
from apex import amp
|
||||||
|
|||||||
@@ -98,8 +98,8 @@ if is_torch_available():
|
|||||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||||
|
|
||||||
# Optimization
|
# Optimization
|
||||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
||||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
||||||
|
|
||||||
|
|
||||||
# TensorFlow
|
# TensorFlow
|
||||||
|
|||||||
@@ -23,89 +23,65 @@ from torch.optim.lr_scheduler import LambdaLR
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class ConstantLRSchedule(LambdaLR):
|
|
||||||
""" Constant learning rate schedule.
|
def get_constant_schedule(optimizer, last_epoch=-1):
|
||||||
|
""" Create a schedule with a constant learning rate.
|
||||||
"""
|
"""
|
||||||
def __init__(self, optimizer, last_epoch=-1):
|
return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
|
||||||
super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
|
|
||||||
|
|
||||||
|
|
||||||
class WarmupConstantSchedule(LambdaLR):
|
def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
|
||||||
""" Linear warmup and then constant.
|
""" Create a schedule with a constant learning rate preceded by a warmup
|
||||||
Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
|
period during which the learning rate increases linearly between 0 and 1.
|
||||||
Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
|
|
||||||
Keeps multiplicative variable equal to 1. after warmup_steps.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, optimizer, warmup_steps, last_epoch=-1):
|
def lr_lambda(current_step):
|
||||||
self.warmup_steps = warmup_steps
|
if current_step < num_warmup_steps:
|
||||||
super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
|
return float(current_step) / float(max(1.0, num_warmup_steps))
|
||||||
|
|
||||||
def lr_lambda(self, step):
|
|
||||||
if step < self.warmup_steps:
|
|
||||||
return float(step) / float(max(1.0, self.warmup_steps))
|
|
||||||
return 1.
|
return 1.
|
||||||
|
|
||||||
|
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||||
|
|
||||||
class WarmupLinearSchedule(LambdaLR):
|
|
||||||
""" Linear warmup and then linear decay.
|
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
|
||||||
Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
|
""" Create a schedule with a learning rate that decreases linearly after
|
||||||
Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
|
linearly increasing during a warmup period.
|
||||||
Linearly decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
|
def lr_lambda(current_step):
|
||||||
self.warmup_steps = warmup_steps
|
if current_step < num_warmup_steps:
|
||||||
self.t_total = t_total
|
return float(current_step) / float(max(1, num_warmup_steps))
|
||||||
super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
|
return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
|
||||||
|
|
||||||
def lr_lambda(self, step):
|
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||||
if step < self.warmup_steps:
|
|
||||||
return float(step) / float(max(1, self.warmup_steps))
|
|
||||||
return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
|
|
||||||
|
|
||||||
|
|
||||||
class WarmupCosineSchedule(LambdaLR):
|
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
|
||||||
""" Linear warmup and then cosine decay.
|
""" Create a schedule with a learning rate that decreases following the
|
||||||
Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
|
values of the cosine function between 0 and `2 * pi * num_cycles` after a warmup
|
||||||
Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
|
period during which it increases linearly between 0 and 1.
|
||||||
Decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
|
|
||||||
If `cycles` (default=0.5) is different from default, then the multiplicative variable follows cosine function after warmup.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
|
def lr_lambda(current_step):
|
||||||
self.warmup_steps = warmup_steps
|
if current_step < num_warmup_steps:
|
||||||
self.t_total = t_total
|
return float(current_step) / float(max(1, num_warmup_steps))
|
||||||
self.cycles = cycles
|
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
|
||||||
super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
|
return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
|
||||||
|
|
||||||
def lr_lambda(self, step):
|
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||||
if step < self.warmup_steps:
|
|
||||||
return float(step) / float(max(1.0, self.warmup_steps))
|
|
||||||
# progress after warmup
|
|
||||||
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
|
|
||||||
return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
|
|
||||||
|
|
||||||
|
|
||||||
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
|
def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
|
||||||
""" Linear warmup and then cosine cycles with hard restarts.
|
""" Create a schedule with a learning rate that decreases following the
|
||||||
Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
|
values of the cosine function with several hard restarts, after a warmup
|
||||||
Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
|
period during which it increases linearly between 0 and 1.
|
||||||
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
|
|
||||||
learning rate (with hard restarts).
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
|
def lr_lambda(current_step):
|
||||||
self.warmup_steps = warmup_steps
|
if current_step < num_warmup_steps:
|
||||||
self.t_total = t_total
|
return float(current_step) / float(max(1, num_warmup_steps))
|
||||||
self.cycles = cycles
|
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
|
||||||
super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
|
if progress >= 1.:
|
||||||
|
return 0.
|
||||||
def lr_lambda(self, step):
|
return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
|
||||||
if step < self.warmup_steps:
|
|
||||||
return float(step) / float(max(1, self.warmup_steps))
|
|
||||||
# progress after warmup
|
|
||||||
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
|
|
||||||
if progress >= 1.0:
|
|
||||||
return 0.0
|
|
||||||
return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
|
|
||||||
|
|
||||||
|
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||||
|
|
||||||
|
|
||||||
class AdamW(Optimizer):
|
class AdamW(Optimizer):
|
||||||
|
|||||||
@@ -25,8 +25,12 @@ from transformers import is_torch_available
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
|
from transformers import (AdamW,
|
||||||
WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
get_constant_schedule,
|
||||||
|
get_constant_schedule_with_warmup,
|
||||||
|
get_cosine_schedule_with_warmup,
|
||||||
|
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||||
|
get_linear_schedule_with_warmup)
|
||||||
else:
|
else:
|
||||||
pytestmark = pytest.mark.skip("Require Torch")
|
pytestmark = pytest.mark.skip("Require Torch")
|
||||||
|
|
||||||
@@ -87,59 +91,60 @@ class ScheduleInitTest(unittest.TestCase):
|
|||||||
self.assertAlmostEqual(a, b, delta=tol)
|
self.assertAlmostEqual(a, b, delta=tol)
|
||||||
|
|
||||||
def test_constant_scheduler(self):
|
def test_constant_scheduler(self):
|
||||||
scheduler = ConstantLRSchedule(self.optimizer)
|
scheduler = get_constant_schedule(self.optimizer)
|
||||||
lrs = unwrap_schedule(scheduler, self.num_steps)
|
lrs = unwrap_schedule(scheduler, self.num_steps)
|
||||||
expected_learning_rates = [10.] * self.num_steps
|
expected_learning_rates = [10.] * self.num_steps
|
||||||
self.assertEqual(len(lrs[0]), 1)
|
self.assertEqual(len(lrs[0]), 1)
|
||||||
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
||||||
|
|
||||||
scheduler = ConstantLRSchedule(self.optimizer)
|
scheduler = get_constant_schedule(self.optimizer)
|
||||||
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
||||||
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
||||||
|
|
||||||
def test_warmup_constant_scheduler(self):
|
def test_warmup_constant_scheduler(self):
|
||||||
scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
|
scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
|
||||||
lrs = unwrap_schedule(scheduler, self.num_steps)
|
lrs = unwrap_schedule(scheduler, self.num_steps)
|
||||||
expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
|
expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
|
||||||
self.assertEqual(len(lrs[0]), 1)
|
self.assertEqual(len(lrs[0]), 1)
|
||||||
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
||||||
|
|
||||||
scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
|
scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4)
|
||||||
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
||||||
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
||||||
|
|
||||||
def test_warmup_linear_scheduler(self):
|
def test_warmup_linear_scheduler(self):
|
||||||
scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
|
scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
|
||||||
lrs = unwrap_schedule(scheduler, self.num_steps)
|
lrs = unwrap_schedule(scheduler, self.num_steps)
|
||||||
expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
|
expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
|
||||||
self.assertEqual(len(lrs[0]), 1)
|
self.assertEqual(len(lrs[0]), 1)
|
||||||
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
|
||||||
|
|
||||||
scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
|
scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
|
||||||
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
||||||
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
||||||
|
|
||||||
def test_warmup_cosine_scheduler(self):
|
def test_warmup_cosine_scheduler(self):
|
||||||
scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
|
scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
|
||||||
lrs = unwrap_schedule(scheduler, self.num_steps)
|
lrs = unwrap_schedule(scheduler, self.num_steps)
|
||||||
expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
|
expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
|
||||||
self.assertEqual(len(lrs[0]), 1)
|
self.assertEqual(len(lrs[0]), 1)
|
||||||
self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
|
self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
|
||||||
|
|
||||||
scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
|
scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10)
|
||||||
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
||||||
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
||||||
|
|
||||||
def test_warmup_cosine_hard_restart_scheduler(self):
|
def test_warmup_cosine_hard_restart_scheduler(self):
|
||||||
scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
|
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
|
||||||
lrs = unwrap_schedule(scheduler, self.num_steps)
|
lrs = unwrap_schedule(scheduler, self.num_steps)
|
||||||
expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
|
expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
|
||||||
self.assertEqual(len(lrs[0]), 1)
|
self.assertEqual(len(lrs[0]), 1)
|
||||||
self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
|
self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
|
||||||
|
|
||||||
scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
|
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10)
|
||||||
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
|
||||||
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user