From 2276bf69b771763a2553eccd5d70c2b7331b1f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 14 Nov 2019 18:00:14 +0100 Subject: [PATCH] update the examples, docs and template --- README.md | 8 ++++---- docs/source/main_classes/optimizer_schedules.rst | 14 +++++--------- docs/source/migration.md | 8 ++++---- examples/contrib/run_openai_gpt.py | 4 ++-- examples/contrib/run_swag.py | 4 ++-- examples/distillation/distiller.py | 8 ++++---- examples/distillation/run_squad_w_distillation.py | 4 ++-- examples/run_glue.py | 4 ++-- examples/run_lm_finetuning.py | 4 ++-- examples/run_multiple_choice.py | 4 ++-- examples/run_ner.py | 4 ++-- examples/run_squad.py | 4 ++-- templates/adding_a_new_example_script/run_xxx.py | 4 ++-- 13 files changed, 35 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 40b08583b1..a22a3c83f5 100644 --- a/README.md +++ b/README.md @@ -521,12 +521,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch # Parameters: lr = 1e-3 max_grad_norm = 1.0 -num_total_steps = 1000 +num_training_steps = 1000 num_warmup_steps = 100 -warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1 +warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1 ### Previously BertAdam optimizer was instantiated like this: -optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps) +optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps) ### and used like this: for batch in train_data: loss = model(batch) @@ -535,7 +535,7 @@ for batch in train_data: ### In Transformers, optimizer and schedules are splitted and instantiated like this: optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False -scheduler = WarmupLinearSchedule(optimizer, 
warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler +scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler ### and used like this: for batch in train_data: model.train() diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst index ff0c9e6929..b30a2e0e2e 100644 --- a/docs/source/main_classes/optimizer_schedules.rst +++ b/docs/source/main_classes/optimizer_schedules.rst @@ -18,19 +18,17 @@ Schedules Learning Rate Schedules ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. autoclass:: transformers.ConstantLRSchedule - :members: +.. autofunction:: transformers.get_constant_schedule -.. autoclass:: transformers.WarmupConstantSchedule - :members: +.. autofunction:: transformers.get_constant_schedule_with_warmup .. image:: /imgs/warmup_constant_schedule.png :target: /imgs/warmup_constant_schedule.png :alt: -.. autoclass:: transformers.WarmupCosineSchedule +.. autofunction:: transformers.get_cosine_schedule_with_warmup :members: .. image:: /imgs/warmup_cosine_schedule.png @@ -38,8 +36,7 @@ Learning Rate Schedules :alt: -.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule - :members: +.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png :target: /imgs/warmup_cosine_hard_restarts_schedule.png @@ -47,8 +44,7 @@ Learning Rate Schedules -.. autoclass:: transformers.WarmupLinearSchedule - :members: +.. autofunction:: transformers.get_linear_schedule_with_warmup .. 
image:: /imgs/warmup_linear_schedule.png :target: /imgs/warmup_linear_schedule.png diff --git a/docs/source/migration.md b/docs/source/migration.md index 553a79c82b..d04b66d5e4 100644 --- a/docs/source/migration.md +++ b/docs/source/migration.md @@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch # Parameters: lr = 1e-3 max_grad_norm = 1.0 -num_total_steps = 1000 +num_training_steps = 1000 num_warmup_steps = 100 -warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1 +warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1 ### Previously BertAdam optimizer was instantiated like this: -optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps) +optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps) ### and used like this: for batch in train_data: loss = model(batch) @@ -98,7 +98,7 @@ for batch in train_data: ### In Transformers, optimizer and schedules are splitted and instantiated like this: optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False -scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler +scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler ### and used like this: for batch in train_data: loss = model(batch) diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index 7eb1b0be76..2d165a91e3 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -41,7 +41,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, AdamW,
cached_path, WEIGHTS_NAME, CONFIG_NAME, - WarmupLinearSchedule) + get_linear_schedule_with_warmup) ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" @@ -211,7 +211,7 @@ def main(): {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 8494c5fad9..5de93db7fe 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -42,7 +42,7 @@ from tqdm import tqdm, trange from transformers import (WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) @@ -322,7 +322,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index d51bdae77f..0442072e84 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -35,7 +35,7 @@ try: except: from tensorboardX import SummaryWriter -from transformers 
import WarmupLinearSchedule +from transformers import get_linear_schedule_with_warmup from utils import logger from lm_seqs_dataset import LmSeqsDataset @@ -137,9 +137,9 @@ class Distiller: betas=(0.9, 0.98)) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) - self.scheduler = WarmupLinearSchedule(self.optimizer, - warmup_steps=warmup_steps, - t_total=num_train_optimization_steps) + self.scheduler = get_linear_schedule_with_warmup(self.optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=num_train_optimization_steps) if self.fp16: try: diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 7c662df010..70b65dc1b8 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -46,7 +46,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from ..utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/run_glue.py b/examples/run_glue.py index 1558a812c3..27048ad565 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -49,7 +49,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, 
DistilBertForSequenceClassification, DistilBertTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from transformers import glue_compute_metrics as compute_metrics from transformers import glue_output_modes as output_modes @@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 2044cfe9e8..0085aee727 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -42,7 +42,7 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule, +from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, BertConfig, BertForMaskedLM, BertTokenizer, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, @@ -185,7 +185,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 
638bbe74f1..544014fb66 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from utils_multiple_choice import (convert_examples_to_features, processors) @@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/run_ner.py b/examples/run_ner.py index b35d8298fe..0077080aec 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -33,7 +33,7 @@ from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer @@ -80,7 +80,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, 
t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/examples/run_squad.py b/examples/run_squad.py index d9dc2abfde..b954a8b8b9 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -45,7 +45,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -100,7 +100,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index 489dcb19c7..77ce587a54 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, WarmupLinearSchedule +from transformers import AdamW, get_linear_schedule_with_warmup from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -98,7 +98,7 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in 
model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp