From b21d84b0276b31cccf56ebe714fa479e865787d2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Jul 2019 15:37:34 +0200 Subject: [PATCH] update examples --- examples/run_bert_classifier.py | 528 -------------------------- examples/run_glue.py | 10 +- examples/run_xlnet_classifier.py | 530 --------------------------- examples/utils.py | 61 --- pytorch_transformers/optimization.py | 4 +- 5 files changed, 9 insertions(+), 1124 deletions(-) delete mode 100644 examples/run_bert_classifier.py delete mode 100644 examples/run_xlnet_classifier.py delete mode 100644 examples/utils.py diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py deleted file mode 100644 index 27b8e6165d..0000000000 --- a/examples/run_bert_classifier.py +++ /dev/null @@ -1,528 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""BERT finetuning runner.""" - -from __future__ import absolute_import, division, print_function - -import argparse -import logging -import os -import sys -import random -from tqdm import tqdm, trange - -import numpy as np - -import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) -from torch.utils.data.distributed import DistributedSampler -from torch.nn import CrossEntropyLoss, MSELoss - -from tensorboardX import SummaryWriter - -from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME -from pytorch_transformers.modeling_bert import BertForSequenceClassification -from pytorch_transformers.tokenization_bert import BertTokenizer -from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule - -from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics - -if sys.version_info[0] == 2: - import cPickle as pickle -else: - import pickle - - -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--data_dir", - default=None, - type=str, - required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--bert_model", default=None, type=str, required=True, - help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " - "bert-base-multilingual-cased, bert-base-chinese.") - parser.add_argument("--task_name", - default=None, - type=str, - required=True, - help="The name of the task to train.") - parser.add_argument("--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.") - - ## Other parameters - parser.add_argument("--cache_dir", - default="", - type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, and sequences shorter \n" - "than this will be padded.") - parser.add_argument("--do_train", - action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", - action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--do_lower_case", - action='store_true', - help="Set this flag if you are using an uncased model.") - parser.add_argument("--train_batch_size", - default=32, - type=int, - help="Total batch size for training.") - parser.add_argument("--eval_batch_size", - default=8, - type=int, - help="Total batch size for eval.") - parser.add_argument("--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--num_train_epochs", - default=3.0, - type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--warmup_proportion", - default=0.1, - type=float, - help="Proportion of training to perform linear learning rate warmup for. " - "E.g., 0.1 = 10%% of training.") - parser.add_argument("--no_cuda", - action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', - action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--seed', - type=int, - default=42, - help="random seed for initialization") - parser.add_argument('--gradient_accumulation_steps', - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument('--fp16', - action='store_true', - help="Whether to use 16-bit float precision instead of 32-bit") - parser.add_argument('--loss_scale', - type=float, default=0, - help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" - "0 (default value): dynamic loss scaling.\n" - "Positive power of 2: static loss scaling value.\n") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") - args = parser.parse_args() - - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - n_gpu = 1 - # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl') - args.device = device - - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) - - if args.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - if not args.do_train and not args.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - - task_name = args.task_name.lower() - - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - - processor = processors[task_name]() - output_mode = output_modes[task_name] - - label_list = processor.get_labels() - num_labels = len(label_list) - - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab - tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) - if args.local_rank == 0: - torch.distributed.barrier() - - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, - device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - global_step = 0 - nb_tr_steps = 0 - tr_loss = 0 - - if args.do_train: - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() - - # Prepare data loader - train_examples = processor.get_train_examples(args.data_dir) - cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), - str(args.max_seq_length), - str(task_name))) - try: - with open(cached_train_features_file, "rb") as reader: - train_features = pickle.load(reader) - except: - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving train features into cached file %s", cached_train_features_file) - with open(cached_train_features_file, "wb") as writer: - pickle.dump(train_features, writer) - - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) - - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - if args.local_rank == -1: - train_sampler = RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer - - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) - - model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): - tr_loss = 0 - nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): - batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch - - # define a new function to compute loss values for both output_modes - ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) - loss = ouputs[0] - - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - tr_loss += loss.item() - nb_tr_examples += input_ids.size(0) - nb_tr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - # modify learning rate with special warm up BERT uses - # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - optimizer.step() - optimizer.zero_grad() - global_step += 1 - if args.local_rank in [-1, 0]: - tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) - tb_writer.add_scalar('loss', loss.item(), global_step) - - ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() - ### Example: - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Save a trained model, configuration and tokenizer - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - - # If we save using the predefined names, we can load using `from_pretrained` - output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - - torch.save(model_to_save.state_dict(), output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(args.output_dir) - - # Load a trained model and vocabulary that you have fine-tuned - model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) - tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) - - # Good practice: save your training arguments together with the trained model - output_args_file = os.path.join(args.output_dir, 'training_args.bin') - torch.save(args, output_args_file) - else: - model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) - - model.to(device) - - ### Evaluation - if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - eval_examples = processor.get_dev_examples(args.data_dir) - cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), - str(args.max_seq_length), - str(task_name))) - try: - with open(cached_eval_features_file, "rb") as reader: - eval_features = pickle.load(reader) - except: - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving eval features into cached file %s", cached_eval_features_file) - with open(cached_eval_features_file, "wb") as writer: - pickle.dump(eval_features, writer) - - - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Run prediction for full data - if args.local_rank == -1: - eval_sampler = SequentialSampler(eval_data) - else: - eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - eval_loss = 0 - nb_eval_steps = 0 - preds = [] - out_label_ids = None - - for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy() - else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, label_ids.detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - if output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, out_label_ids) - - loss = tr_loss/global_step if args.do_train else None - - result['eval_loss'] = eval_loss - result['global_step'] = global_step - result['loss'] = loss - - output_eval_file = os.path.join(args.output_dir, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - # hack for MNLI-MM - if task_name == "mnli": - task_name = "mnli-mm" - processor = processors[task_name]() - - if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir + '-MM'): - os.makedirs(args.output_dir + '-MM') - - eval_examples = processor.get_dev_examples(args.data_dir) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - eval_loss = 0 - nb_eval_steps = 0 - preds = [] - out_label_ids = None - - for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) - - loss_fct = CrossEntropyLoss() - tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy() - else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, label_ids.detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - preds = np.argmax(preds, axis=1) - result = compute_metrics(task_name, preds, out_label_ids) - - loss = tr_loss/global_step if args.do_train else None - - result['eval_loss'] = eval_loss - result['global_step'] = global_step - result['loss'] = loss - - output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - -if __name__ == "__main__": - main() diff --git a/examples/run_glue.py b/examples/run_glue.py index 93f69e1741..aaf9a9876c 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -74,8 +74,8 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp @@ -300,6 +300,8 @@ def main(): help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, @@ -358,7 +360,9 @@ def main(): args.device = device # Setup logging - logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py deleted file mode 100644 index 35b0ebfbd1..0000000000 --- a/examples/run_xlnet_classifier.py +++ /dev/null @@ -1,530 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""BERT finetuning runner.""" - -from __future__ import absolute_import, division, print_function - -import argparse -import logging -import os -import sys -import random -from tqdm import tqdm, trange - -import numpy as np - -import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) -from torch.utils.data.distributed import DistributedSampler -from torch.nn import CrossEntropyLoss, MSELoss - -from tensorboardX import SummaryWriter - -from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME -from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification -from pytorch_transformers.tokenization_xlnet import XLNetTokenizer -from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule - -from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics - -if sys.version_info[0] == 2: - import cPickle as pickle -else: - import pickle - - -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") - # training - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0 limit the number of training steps to perform, you should choose only one of num_train_epochs and max_steps.") - parser.add_argument("--warmup_proportion", default=0.1, type=float, - help="Proportion of training to perform linear learning rate warmup for. " - "E.g., 0.1 = 10%% of training.") - parser.add_argument("--clip_gradients", default=1.0, type=float, - help="Clip gradient norms.") - parser.add_argument("--train_batch_size", default=32, type=int, - help="Total batch size for training.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit float precision instead of 32-bit") - parser.add_argument('--loss_scale', type=float, default=0, - help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" - "0 (default value): dynamic loss scaling.\n" - "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--log_every", default=10, type=int, - help="Log metrics every X training steps.") - # evaluation - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--eval_batch_size", default=8, type=int, - help="Total batch size for eval.") - # Model - parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str, - help="XLNet pre-trained model: currently only xlnet-large-cased.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - # task specific - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, and sequences shorter \n" - "than this will be padded.") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - # Misc - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") - args = parser.parse_args() - - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - n_gpu = 1 - # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl') - args.device = device - - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) - - if args.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - if not args.do_train and not args.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - - task_name = args.task_name.lower() - - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - - processor = processors[task_name]() - output_mode = output_modes[task_name] - - label_list = processor.get_labels() - num_labels = len(label_list) - - if args.local_rank not in [-1, 0]: - torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab - tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case) - model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels) - if args.local_rank == 0: - torch.distributed.barrier() - - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, - device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - global_step = 0 - curr_tr_loss, curr_steps = 0., 1 - - if args.do_train: - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() - - # Prepare data loader - train_examples = processor.get_train_examples(args.data_dir) - cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( - list(filter(None, args.xlnet_model.split('/'))).pop(), - str(args.max_seq_length), - str(task_name))) - if os.path.exists(cached_train_features_file): - logger.info("Loading train features for cache file %s", cached_train_features_file) - with open(cached_train_features_file, "rb") as reader: - train_features = pickle.load(reader) - else: - logger.info("No cache file at %s, preparing train features", cached_train_features_file) - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode, - cls_token_at_end=True, cls_token=tokenizer.cls_token, - sep_token=tokenizer.sep_token, cls_token_segment_id=2, - pad_on_left=True, pad_token_segment_id=4) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving train features into cached file %s", cached_train_features_file) - with open(cached_train_features_file, "wb") as writer: - pickle.dump(train_features, writer) - - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) - - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - if args.local_rank == -1: - train_sampler = SequentialSampler(train_data) # RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - num_train_optimization_steps = args.max_steps - else: - num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer - - optimizer_grouped_parameters = model.parameters() - # param_optimizer = list(model.named_parameters()) - # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - # optimizer_grouped_parameters = [ - # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - # ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) - - model.train() - for _ in trange(int(args.num_train_epochs) if args.max_steps <= 0 else int('Inf'), - desc="Epoch", disable=args.local_rank not in [-1, 0]): - for step, batch in enumerate(tqdm(train_dataloader, - desc="Iteration", - disable=args.local_rank not in [-1, 0])): - batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch - - # define a new function to compute loss values for both output_modes - loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) - - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients) - - curr_tr_loss += loss.item() - curr_steps += 1 - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - # modify learning rate with special warm up BERT uses - # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - optimizer.step() - optimizer.zero_grad() - global_step += 1 - if args.local_rank in [-1, 0] and (args.log_every <= 0 or (global_step + 1) % args.log_every == 0): - learning_rate = optimizer.get_lr()[0] if not args.fp16 else lr_this_step - logger.info("[{}] | gnorm {:.2f} lr {:8.6f} | loss {:.2f}".format( - global_step, gnorm, learning_rate, curr_tr_loss / curr_steps)) - tb_writer.add_scalar('lr', learning_rate, global_step) - tb_writer.add_scalar('loss', curr_tr_loss / curr_steps, global_step) - curr_tr_loss, curr_steps = 0., 1 - - if args.max_steps > 0 and global_step > args.max_steps: - break - - if args.max_steps > 0 and global_step > args.max_steps: - break - - ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() - ### Example: - if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - # Save a trained model, configuration and tokenizer - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - - # If we save using the predefined names, we can load using `from_pretrained` - output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) - output_config_file = os.path.join(args.output_dir, CONFIG_NAME) - - torch.save(model_to_save.state_dict(), output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(args.output_dir) - - # Load a trained model and vocabulary that you have fine-tuned - model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) - tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) - - # Good practice: save your training arguments together with the trained model - output_args_file = os.path.join(args.output_dir, 'training_args.bin') - torch.save(args, output_args_file) - else: - model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels) - - model.to(device) - - ### Evaluation - if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - eval_examples = processor.get_dev_examples(args.data_dir) - cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( - list(filter(None, args.xlnet_model.split('/'))).pop(), - str(args.max_seq_length), - str(task_name))) - if os.path.exists(cached_eval_features_file): - logger.info("Loading eval features for cache file %s", cached_eval_features_file) - with open(cached_eval_features_file, "rb") as reader: - eval_features = pickle.load(reader) - else: - logger.info("No cache file at %s, preparing eval features", cached_eval_features_file) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, - cls_token_at_end=True, cls_token=tokenizer.cls_token, - sep_token=tokenizer.sep_token, cls_token_segment_id=2, - pad_on_left=True, pad_token_segment_id=4) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving eval features into cached file %s", cached_eval_features_file) - with open(cached_eval_features_file, "wb") as writer: - pickle.dump(eval_features, writer) - - - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Run prediction for full data - if args.local_rank == -1: - eval_sampler = SequentialSampler(eval_data) - else: - eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - eval_loss = 0 - nb_eval_steps = 0 - preds = [] - out_label_ids = None - - for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) - - # create eval loss and other metric required by the task - if output_mode == "classification": - loss_fct = CrossEntropyLoss() - tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - elif output_mode == "regression": - loss_fct = MSELoss() - tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy() - else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, label_ids.detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - if output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(task_name, preds, out_label_ids) - - loss = curr_tr_loss/curr_steps if args.do_train else None - - result['eval_loss'] = eval_loss - result['global_step'] = global_step - result['loss'] = loss - - output_eval_file = os.path.join(args.output_dir, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - # hack for MNLI-MM - if task_name == "mnli": - task_name = "mnli-mm" - processor = processors[task_name]() - - if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir + '-MM'): - os.makedirs(args.output_dir + '-MM') - - eval_examples = processor.get_dev_examples(args.data_dir) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - eval_loss = 0 - nb_eval_steps = 0 - preds = [] - out_label_ids = None - - for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) - - loss_fct = CrossEntropyLoss() - tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if len(preds) == 0: - preds.append(logits.detach().cpu().numpy()) - out_label_ids = label_ids.detach().cpu().numpy() - else: - preds[0] = np.append( - preds[0], logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, label_ids.detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - preds = preds[0] - preds = np.argmax(preds, axis=1) - result = compute_metrics(task_name, preds, out_label_ids) - - loss = curr_tr_loss/curr_steps if args.do_train else None - - result['eval_loss'] = eval_loss - result['global_step'] = global_step - result['loss'] = loss - - output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - -if __name__ == "__main__": - main() diff --git a/examples/utils.py b/examples/utils.py deleted file mode 100644 index e4b7263efa..0000000000 --- a/examples/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2019-present, the HuggingFace Inc. authors. -# All rights reserved. This source code is licensed under the BSD-style -# license found in the LICENSE file in the root directory of this source tree. -import logging -import os -from tqdm import tqdm -from pprint import pformat - -import torch - -from ignite.engine import Engine, Events -from ignite.handlers import ModelCheckpoint -from ignite.metrics import RunningAverage -from ignite.contrib.handlers import ProgressBar -from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger - - -def average_distributed_scalar(scalar, args): - """ Average a scalar over nodes if we are in distributed training. - We use this for distributed evaluation. - Beware, such averages only works for metrics which are additive with regard - to the evaluation dataset, e.g. accuracy, log probabilities. - Doesn't work for ratio metrics like F1. - """ - if args.local_rank == -1: - return scalar - scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size() - torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) - return scalar_t.item() - - -def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""): - """ Add to a PyTorch ignite training engine tensorboard logging, - progress bar with average loss, checkpoint saving and save training config. - """ - # Add progress bar with average loss - RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss") - pbar = ProgressBar(persist=True) - pbar.attach(trainer, metric_names=[prefix + "loss"]) - evaluator.add_event_handler(Events.COMPLETED, lambda _: - pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) - - # Add tensorboard logging with training and evaluation metrics - tb_logger = TensorboardLogger(log_dir=None) - tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]), - event_name=Events.ITERATION_COMPLETED) - tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), - event_name=Events.ITERATION_STARTED) - @evaluator.on(Events.COMPLETED) - def tb_log_metrics(engine): - for name in metrics.keys(): - tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration) - - # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()') - checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) - trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) - - # Save training configuration - torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) - - return checkpoint_handler, tb_logger diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py index c7f169f0b6..c78818dd74 100644 --- a/pytorch_transformers/optimization.py +++ b/pytorch_transformers/optimization.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) class ConstantLRSchedule(LambdaLR): def __init__(self, optimizer, last_epoch=-1): - super(ConstantLR, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch) + super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch) class WarmupCosineSchedule(LambdaLR): """ @@ -128,7 +128,7 @@ class AdamW(Optimizer): raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) - super(BertAdam, self).__init__(params, defaults) + super(AdamW, self).__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step.