From 6bc082da0aedff4c6128356fbef8e101fc23d635 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Fri, 8 Feb 2019 00:02:26 +0100
Subject: [PATCH] updating examples

---
 examples/run_openai_gpt.py   | 246 ++++++++++++++
 examples/run_transfo_xl.py   | 142 +++++++++
 examples/train_openai_gpt.py | 344 --------------------
 examples/train_transfo_xl.py | 595 -----------------------------------
 examples/transfo_xl_eval.py  | 139 --------
 5 files changed, 388 insertions(+), 1078 deletions(-)
 create mode 100644 examples/run_openai_gpt.py
 create mode 100644 examples/run_transfo_xl.py
 delete mode 100644 examples/train_openai_gpt.py
 delete mode 100644 examples/train_transfo_xl.py
 delete mode 100644 examples/transfo_xl_eval.py

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
new file mode 100644
index 0000000000..4f76407958
--- /dev/null
+++ b/examples/run_openai_gpt.py
@@ -0,0 +1,246 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT model fine-tuning script.
+    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
+    Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
+
+    This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset
+"""
+import argparse
+import os
+import csv
+import random
+import logging
+from tqdm import tqdm, trange
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+
+from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def accuracy(out, labels):
+    """ Number (not ratio) of rows of `out` whose argmax over axis 1 equals `labels` """
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels)
+
+def load_rocstories_dataset(dataset_path):
+    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
+    with open(dataset_path, encoding='utf_8') as f:
+        reader = csv.reader(f)
+        output = []
+        next(reader) # skip the first line
+        for line in tqdm(reader):
+            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+    return output
+
+def pre_process_datasets(encoded_datasets, max_len, start_token, delimiter_token, clf_token):
+    """ Pre-process datasets containing lists of
+        tuples(story, 1st continuation, 2nd continuation, label)
+
+        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+        input_ids[batch, alternative, :] = [start_token] + story[:max_len] + [delimiter_token] + cont[:max_len] + [clf_token]
+
+        Story and continuation are each capped to max_len tokens, so length is 2 * max_len + 3.
+        Returns one tuple (input_ids, mc_token_mask, lm_labels, mc_labels) of tensors per dataset.
+    """
+    input_len = 2 * max_len + 3 # longest possible row: 3 special tokens + two capped parts
+    tensor_datasets = []
+    for dataset in encoded_datasets:
+        n_batch = len(dataset)
+        # Token ids and class-index labels are stored as int64 so they become LongTensors
+        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int32)
+        lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
+        mc_labels = np.zeros((n_batch,), dtype=np.int64)
+        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
+            with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
+            with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token]
+            input_ids[i, 0, :len(with_cont1)] = with_cont1
+            input_ids[i, 1, :len(with_cont2)] = with_cont2
+            # Mark the position of the classification token of both alternatives
+            mc_token_mask[i, 0, len(with_cont1) - 1] = 1
+            mc_token_mask[i, 1, len(with_cont2) - 1] = 1
+            lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
+            lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
+            mc_labels[i] = mc_label
+        all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
+        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
+    return tensor_datasets
+
+def main():
+    """ Fine-tune (--do_train) and/or evaluate (--do_eval) the double-heads model; results go to --output_dir """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name', type=str, default='openai-gpt',
+                        help='pretrained model name')
+    parser.add_argument('--do_train', action='store_true', help='run fine-tuning on the train dataset')
+    parser.add_argument('--do_eval', action='store_true', help='run evaluation on the eval dataset')
+    parser.add_argument('--output_dir', type=str, required=True,
+                        help='directory where the model weights and the eval results are written')
+    parser.add_argument('--train_dataset', type=str, default='cloze_test_val__spring2016 - cloze_test_ALL_val.tsv')
+    parser.add_argument('--eval_dataset', type=str, default='test_spring2016.tsv')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--num_train_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--eval_batch_size', type=int, default=16)
+    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
+    parser.add_argument('--warmup_proportion', type=float, default=0.002)
+    parser.add_argument('--max_grad_norm', type=float, default=1)
+    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
+    parser.add_argument('--weight_decay', type=float, default=0.01)
+    parser.add_argument('--lm_coef', type=float, default=0.5)
+    parser.add_argument('--n_valid', type=int, default=374)
+    args = parser.parse_args()
+    print(args)
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info("device: {}, n_gpu {}".format(device, n_gpu))
+
+    # Load tokenizer and model
+    # These loading functions also add new tokens and embeddings called `special tokens`
+    # These new embeddings will be fine-tuned on the RocStories dataset
+    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
+    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+    model.to(device)
+
+    # Load and encode the datasets
+    logger.info("Encoding dataset...")
+    train_dataset = load_rocstories_dataset(args.train_dataset)
+    eval_dataset = load_rocstories_dataset(args.eval_dataset)
+    datasets = (train_dataset, eval_dataset)
+    # Only the three text fields go through the tokenizer; the integer label is passed along unchanged
+    def encode(text):
+        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+    encoded_datasets = tuple([(encode(story), encode(cont1), encode(cont2), label)
+                              for story, cont1, cont2, label in dataset] for dataset in datasets)
+
+    # Compute the max input length for the Transformer
+    max_input_length = max(len(story) + max(len(cont1), len(cont2)) + 3 \
+                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    max_input_length = min(max_input_length, model.config.n_positions) # Max size of input for the pre-trained model
+    max_sub_part_length = max_input_length // 2 - 2
+
+    # Prepare inputs tensors and dataloaders
+    tensor_datasets = pre_process_datasets(encoded_datasets, max_sub_part_length, *special_tokens_ids)
+    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
+
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_data = TensorDataset(*eval_tensor_dataset)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    # The learning rate schedule has to span every epoch, not only the first one
+    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+    optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                           lr=args.learning_rate,
+                           warmup=args.warmup_proportion,
+                           max_grad_norm=args.max_grad_norm,
+                           weight_decay=args.weight_decay,
+                           t_total=num_train_optimization_steps)
+
+    if args.do_train:
+        nb_tr_steps = 0
+        tr_loss = 0
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0
+            nb_tr_examples, nb_tr_steps = 0, 0
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, mc_token_mask, lm_labels, mc_labels = batch
+                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                loss = args.lm_coef * losses[0] + losses[1]
+                loss.backward()
+                optimizer.step()
+                optimizer.zero_grad() # otherwise gradients keep accumulating across steps
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+
+    # Save a trained model
+    os.makedirs(args.output_dir, exist_ok=True)
+    model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
+    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
+    if args.do_train:
+        torch.save(model_to_save.state_dict(), output_model_file)
+
+    # Load a trained model that you have fine-tuned
+    model_state_dict = torch.load(output_model_file)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, state_dict=model_state_dict,
+                                                      num_special_tokens=len(special_tokens))
+    model.to(device)
+
+    if args.do_eval:
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            batch = tuple(t.to(device) for t in batch)
+            input_ids, mc_token_mask, lm_labels, mc_labels = batch
+            with torch.no_grad():
+                _, mc_loss = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                _, mc_logits = model(input_ids, mc_token_mask)
+
+            mc_logits = mc_logits.detach().cpu().numpy()
+            mc_labels = mc_labels.to('cpu').numpy()
+            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
+
+            eval_loss += mc_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+        train_loss = tr_loss/nb_tr_steps if args.do_train else None
+        result = {'eval_loss': eval_loss,
+                  'eval_accuracy': eval_accuracy,
+                  'train_loss': train_loss}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info(" %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
new file mode 100644
index 0000000000..1218a1f547
--- /dev/null
+++ b/examples/run_transfo_xl.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model evaluation script.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+
+    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+import time
+import math
+
+import torch
+
+from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main():  # parse the CLI options, load corpus and model, then log loss and perplexity for the chosen split(s)
+    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
+    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
+                        help='pretrained model name')
+    parser.add_argument('--split', type=str, default='test',
+                        choices=['all', 'valid', 'test'],
+                        help='which split to evaluate')
+    parser.add_argument('--batch_size', type=int, default=10,
+                        help='batch size')
+    parser.add_argument('--tgt_len', type=int, default=128,
+                        help='number of tokens to predict')
+    parser.add_argument('--ext_len', type=int, default=0,
+                        help='length of the extended context')
+    parser.add_argument('--mem_len', type=int, default=1600,
+                        help='length of the retained previous heads')
+    parser.add_argument('--clamp_len', type=int, default=1000,
+                        help='max positional embedding index')
+    parser.add_argument('--cuda', action='store_true',
+                        help='use CUDA')
+    parser.add_argument('--work_dir', type=str, required=True,
+                        help='path to the work_dir')  # NOTE(review): required, yet args.work_dir is never read below
+    parser.add_argument('--no_log', action='store_true',
+                        help='do not log the eval result')  # NOTE(review): args.no_log is never read below
+    parser.add_argument('--same_length', action='store_true',
+                        help='set same length attention with masking')
+    args = parser.parse_args()
+    assert args.ext_len >= 0, 'extended context length must be non-negative'
+
+    device = torch.device("cuda" if args.cuda else "cpu")
+
+    # Load a pre-processed dataset
+    # You can also build the corpus yourself using TransfoXLCorpus methods
+    # The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
+    # and tokenizing the dataset
+    # The pre-processed corpus is a conversion (using the conversion script)
+    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
+    ntokens = len(corpus.vocab)  # NOTE(review): not referenced again in this script
+
+    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
+                                  device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
+                                  device=device, ext_len=args.ext_len)
+
+    # Load a pre-trained model
+    model = TransfoXLModel.from_pretrained(args.model_name)
+    model = model.to(device)
+
+    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
+        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+
+    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
+    if args.clamp_len > 0:
+        model.clamp_len = args.clamp_len
+    if args.same_length:
+        model.same_length = True
+
+    ###############################################################################
+    # Evaluation code
+    ###############################################################################
+    def evaluate(eval_iter):  # returns total_loss / total_len accumulated over eval_iter
+        # Turn on evaluation mode which disables dropout.
+        model.eval()
+        total_len, total_loss = 0, 0.
+        start_time = time.time()
+        with torch.no_grad():
+            mems = tuple()  # empty on the first segment, then whatever the model returned last
+            for idx, (data, target, seq_len) in enumerate(eval_iter):
+                ret = model(data, target, *mems)
+                loss, mems = ret
+                loss = loss.mean()
+                total_loss += seq_len * loss.item()  # weight the segment's mean loss by seq_len
+                total_len += seq_len
+            total_time = time.time() - start_time
+        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
+                total_time, 1000 * total_time / (idx+1)))  # NOTE(review): idx is unbound if eval_iter yields nothing
+        return total_loss / total_len
+
+    # Run on the split(s) selected with --split
+    if args.split == 'all':
+        test_loss = evaluate(te_iter)
+        valid_loss = evaluate(va_iter)
+    elif args.split == 'valid':
+        valid_loss = evaluate(va_iter)
+        test_loss = None
+    elif args.split == 'test':
+        test_loss = evaluate(te_iter)
+        valid_loss = None
+
+    def format_log(loss, split):  # one '| <split> loss ... | <split> ppl ...' fragment, ppl being exp(loss)
+        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
+            split, loss, math.exp(loss))
+        return log_str
+
+    log_str = ''
+    if valid_loss is not None:
+        log_str += format_log(valid_loss, 'valid')
+    if test_loss is not None:
+        log_str += format_log(test_loss, 'test')
+
+    logger.info('=' * 100)
+    logger.info(log_str)
+    logger.info('=' * 100)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/examples/train_openai_gpt.py b/examples/train_openai_gpt.py
deleted file mode 100644
index 7a3dd90988..0000000000
--- a/examples/train_openai_gpt.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -""" OpenAI GPT model fine-tuning script. - Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py - It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py - - This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset -""" -import argparse -import os -import csv -import random -import logging -from tqdm import tqdm - -import numpy as np -import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) -from sklearn.metrics import accuracy_score -from sklearn.utils import shuffle - -from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam - -# from analysis import rocstories as rocstories_analysis -# from datasets import rocstories -# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model -# from opt import OpenAIAdam -# from text_utils import TextEncoder -# from utils import (encode_dataset, iter_data, -# ResultLogger, make_path) -# from loss import MultipleChoiceLossCompute - -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) -logger = logging.getLogger(__name__) - -def iter_apply(Xs, Ms, Ys): - # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))] - logits = [] - cost = 0 - with torch.no_grad(): - dh_model.eval() - for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True): - n = len(xmb) - XMB = torch.tensor(xmb, dtype=torch.long).to(device) - YMB = torch.tensor(ymb, dtype=torch.long).to(device) - MMB = torch.tensor(mmb).to(device) - _, clf_logits = dh_model(XMB) - clf_logits *= n - clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True) - clf_losses *= n - 
logits.append(clf_logits.to("cpu").numpy()) - cost += clf_losses.sum().item() - logits = np.concatenate(logits, 0) - return logits, cost - - -def iter_predict(Xs, Ms): - logits = [] - with torch.no_grad(): - dh_model.eval() - for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True): - n = len(xmb) - XMB = torch.tensor(xmb, dtype=torch.long).to(device) - MMB = torch.tensor(mmb).to(device) - _, clf_logits = dh_model(XMB) - logits.append(clf_logits.to("cpu").numpy()) - logits = np.concatenate(logits, 0) - return logits - - -def log(save_dir, desc): - global best_score - print("Logging") - tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid]) - va_logits, va_cost = iter_apply(vaX, vaM, vaY) - tr_cost = tr_cost / len(trY[:n_valid]) - va_cost = va_cost / n_valid - tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100. - va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100. - logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc) - print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc)) - if submit: - score = va_acc - if score > best_score: - best_score = score - path = os.path.join(save_dir, desc, 'best_params') - torch.save(dh_model.state_dict(), make_path(path)) - - -def predict(dataset, submission_dir): - filename = filenames[dataset] - pred_fn = pred_fns[dataset] - label_decoder = label_decoders[dataset] - predictions = pred_fn(iter_predict(teX, teM)) - if label_decoder is not None: - predictions = [label_decoder[prediction] for prediction in predictions] - path = os.path.join(submission_dir, filename) - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'w') as f: - f.write('{}\t{}\n'.format('index', 'prediction')) - for i, prediction in enumerate(predictions): - f.write('{}\t{}\n'.format(i, prediction)) - - -def run_epoch(): - for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, 
trYt, random_state=np.random), - n_batch=n_batch_train, truncate=True, verbose=True): - global n_updates - dh_model.train() - XMB = torch.tensor(xmb, dtype=torch.long).to(device) - YMB = torch.tensor(ymb, dtype=torch.long).to(device) - MMB = torch.tensor(mmb).to(device) - lm_logits, clf_logits = dh_model(XMB) - compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits) - n_updates += 1 - if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0: - log(save_dir, desc) - - -def accuracy(out, labels): - outputs = np.argmax(out, axis=1) - return np.sum(outputs == labels) - -def load_rocstories_dataset(dataset_path): - """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """ - with open(dataset_path, encoding='utf_8') as f: - f = csv.reader(f) - output = [] - next(f) # skip the first line - for line in tqdm(f): - output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1)) - return output - -def pre_process_dataset(encoded_dataset, max_len, start_token, delimiter_token, clf_token): - n_batch = len(dataset) - input_ids = np.zeros((n_batch, 2, max_len), dtype=np.int32) - mc_token_mask = np.zeros((n_batch, 2, max_len), dtype=np.int32) - lm_labels = np.full((n_batch, 2, max_len), -1, dtype=np.float32) - mc_labels = np.zeros((n_batch,), dtype=np.float32) - for i, (story, cont1, cont2, mc_label), in enumerate(encoded_dataset): - with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token] - with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token] - xmb[i, 0, :len(with_cont1)] = with_cont1 - xmb[i, 1, :len(with_cont2)] = with_cont2 - mc_token_mask[i, 0, len(with_cont1) - 1] = 1 - lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:] - lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:] - mc_labels[i] = mc_label - all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels) - all_input_tensors = list(torch.tensor(t) for t in all_inputs) - return 
all_input_tensors - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='openai-gpt', - help='pretrained model name') - parser.add_argument('--data_dir', type=str, default='data/') - parser.add_argument('--seed', type=int, default=42) - parser.add_argument('--num_train_epochs', type=int, default=3) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--max_grad_norm', type=int, default=1) - parser.add_argument('--learning_rate', type=float, default=6.25e-5) - parser.add_argument('--warmup_proportion', type=float, default=0.002) - parser.add_argument('--max_grad_norm', type=float, default=1) - parser.add_argument('--lr_schedule', type=str, default='warmup_linear') - parser.add_argument('--weight_decay', type=float, default=0.01) - parser.add_argument('--lm_coef', type=float, default=0.5) - parser.add_argument('--n_valid', type=int, default=374) - args = parser.parse_args() - print(args) - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed_all(args.seed) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - n_gpu = torch.cuda.device_count() - logger.info("device", device, "n_gpu", n_gpu) - - # Load tokenizer and model - # This loading functions also add new tokens and embeddings called `special tokens` - # These new embeddings will be fine-tuned on the RocStories dataset - special_tokens = ['_start_', '_delimiter_', '_classify_'] - tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens) - special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens) - model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens)) - - # Load the dataset and prepare the inputs - logger.info("Encoding dataset...") - dataset = load_rocstories_dataset(args.dataset_path) - tokenized_dataset = 
list(list(tokenizer.tokenize(x) for x in instance) for instance in dataset) - encoded_dataset = list(list(tokenizer.convert_tokens_to_ids(x) for x in instance) for instance in tokenized_dataset) - - max_input_length = max(len(story)+max(len(cont1), len(cont2))+3 for story, cont1, cont2, _ in encoded_dataset) - max_input_length = min(max_input_length, model.config.n_positions) # Max size of input for the pre-trained model - max_sub_part_length = max_input_length // 2 - 2 - - # Prepare dataloader - dataset_tensors = pre_process_dataset(encoded_dataset, max_sub_part_length, *special_tokens_ids) - train_data = TensorDataset(*dataset_tensors) - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - num_train_optimization_steps = len(train_data) // args.train_batch_size - optimizer = OpenAIAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - max_grad_norm=args.max_grad_norm, - weight_decay=arsg.weight_decay, - t_total=num_train_optimization_steps) - - if args.do_train: - global_step = 0 - nb_tr_steps = 0 - tr_loss = 0 - model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch"): - tr_loss = 0 - nb_tr_examples, nb_tr_steps = 0, 0 - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): - batch = tuple(t.to(device) for t in batch) - input_ids, mc_token_mask, lm_labels, mc_labels = batch - losses = model(input_ids, mc_token_mask, lm_labels, mc_labels) - loss = args.lm_coef * losses[0] + losses[1] - loss.backward() - tr_loss += 
loss.item() - nb_tr_examples += input_ids.size(0) - nb_tr_steps += 1 - - # Save a trained model - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") - if args.do_train: - torch.save(model_to_save.state_dict(), output_model_file) - - # Load a trained model that you have fine-tuned - model_state_dict = torch.load(output_model_file) - model = OpenAIGPTDoubleHeadsModel(args.mode, state_dict=model_state_dict, num_labels=num_labels) - model.to(device) - - if args.do_eval: - eval_examples = processor.get_dev_examples(args.data_dir) - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer) - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - model.eval() - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 - - for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - label_ids = label_ids.to(device) - - with torch.no_grad(): - tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) - logits = 
model(input_ids, segment_ids, input_mask) - - logits = logits.detach().cpu().numpy() - label_ids = label_ids.to('cpu').numpy() - tmp_eval_accuracy = accuracy(logits, label_ids) - - eval_loss += tmp_eval_loss.mean().item() - eval_accuracy += tmp_eval_accuracy - - nb_eval_examples += input_ids.size(0) - nb_eval_steps += 1 - - eval_loss = eval_loss / nb_eval_steps - eval_accuracy = eval_accuracy / nb_eval_examples - loss = tr_loss/nb_tr_steps if args.do_train else None - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'global_step': global_step, - 'loss': loss} - - output_eval_file = os.path.join(args.output_dir, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - -if __name__ == "__main__": - main() - - n_updates = 0 - n_epochs = 0 - if dataset != 'stsb': - trYt = trY - if submit: - path = os.path.join(save_dir, desc, 'best_params') - torch.save(dh_model.state_dict(), make_path(path)) - best_score = 0 - for i in range(args.n_iter): - print("running epoch", i) - run_epoch() - n_epochs += 1 - log(save_dir, desc) - if submit: - path = os.path.join(save_dir, desc, 'best_params') - dh_model.load_state_dict(torch.load(path)) - predict(dataset, args.submission_dir) - if args.analysis: - rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'), - os.path.join(log_dir, 'rocstories.jsonl')) diff --git a/examples/train_transfo_xl.py b/examples/train_transfo_xl.py deleted file mode 100644 index 6ea0920489..0000000000 --- a/examples/train_transfo_xl.py +++ /dev/null @@ -1,595 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Transformer XL model training script. - Adapted from https://github.com/kimiyoung/transformer-xl. - In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py - - This script with default values train a Transformer-XL on WikiText 103 -""" -from __future__ import absolute_import, division, print_function, unicode_literals - -import os -import functools -import argparse -import logging -import time -import math -import sys -from io import open -import itertools - -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim - -from pytorch_pretrained_bert import TransfoXLModel, TransfoXLConfig -from pytorch_pretrained_bert.tokenization_transfo_xl import get_lm_corpus - -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) -logger = logging.getLogger(__name__) - - -parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') -parser.add_argument('--data', type=str, default='../data/wikitext-103', - help='location of the data corpus') -parser.add_argument('--dataset', type=str, default='wt103', - choices=['wt103', 'lm1b', 'enwik8', 'text8'], - help='dataset name') -parser.add_argument('--n_layer', type=int, default=12, - help='number of total layers') -parser.add_argument('--n_head', type=int, default=10, - help='number of heads') 
-parser.add_argument('--d_head', type=int, default=50, - help='head dimension') -parser.add_argument('--d_embed', type=int, default=-1, - help='embedding dimension') -parser.add_argument('--d_model', type=int, default=500, - help='model dimension') -parser.add_argument('--d_inner', type=int, default=1000, - help='inner dimension in FF') -parser.add_argument('--dropout', type=float, default=0.0, - help='global dropout rate') -parser.add_argument('--dropatt', type=float, default=0.0, - help='attention probability dropout rate') -parser.add_argument('--init', default='normal', type=str, - help='parameter initializer to use.') -parser.add_argument('--emb_init', default='normal', type=str, - help='parameter initializer to use.') -parser.add_argument('--init_range', type=float, default=0.1, - help='parameters initialized by U(-init_range, init_range)') -parser.add_argument('--emb_init_range', type=float, default=0.01, - help='parameters initialized by U(-init_range, init_range)') -parser.add_argument('--init_std', type=float, default=0.02, - help='parameters initialized by N(0, init_std)') -parser.add_argument('--proj_init_std', type=float, default=0.01, - help='parameters initialized by N(0, init_std)') -parser.add_argument('--optim', default='adam', type=str, - choices=['adam', 'sgd', 'adagrad'], - help='optimizer to use.') -parser.add_argument('--lr', type=float, default=0.00025, - help='initial learning rate (0.00025|5 for adam|sgd)') -parser.add_argument('--mom', type=float, default=0.0, - help='momentum for sgd') -parser.add_argument('--scheduler', default='cosine', type=str, - choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], - help='lr scheduler to use.') -parser.add_argument('--warmup_step', type=int, default=0, - help='upper epoch limit') -parser.add_argument('--decay_rate', type=float, default=0.5, - help='decay factor when ReduceLROnPlateau is used') -parser.add_argument('--lr_min', type=float, default=0.0, - help='minimum learning rate during 
annealing') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--clip_nonemb', action='store_true', - help='only clip the gradient of non-embedding params') -parser.add_argument('--max_step', type=int, default=100000, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=60, - help='batch size') -parser.add_argument('--batch_chunk', type=int, default=1, - help='split batch into chunks to save memory') -parser.add_argument('--tgt_len', type=int, default=70, - help='number of tokens to predict') -parser.add_argument('--eval_tgt_len', type=int, default=50, - help='number of tokens to predict for evaluation') -parser.add_argument('--ext_len', type=int, default=0, - help='length of the extended context') -parser.add_argument('--mem_len', type=int, default=0, - help='length of the retained previous heads') -parser.add_argument('--not_tied', action='store_true', - help='do not tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', - help='use CUDA') -parser.add_argument('--adaptive', action='store_true', - help='use adaptive softmax') -parser.add_argument('--div_val', type=int, default=1, - help='divident value for adapative input and softmax') -parser.add_argument('--pre_lnorm', action='store_true', - help='apply LayerNorm to the input instead of the output') -parser.add_argument('--varlen', action='store_true', - help='use variable length') -parser.add_argument('--multi_gpu', action='store_true', - help='use multiple GPU') -parser.add_argument('--log-interval', type=int, default=200, - help='report interval') -parser.add_argument('--eval-interval', type=int, default=4000, - help='evaluation interval') -parser.add_argument('--work_dir', default='LM-TFM', type=str, - help='experiment directory.') -parser.add_argument('--restart', action='store_true', - help='restart 
training from the saved checkpoint') -parser.add_argument('--restart_dir', type=str, default='', - help='restart dir') -parser.add_argument('--debug', action='store_true', - help='run in debug mode (do not create exp dir)') -parser.add_argument('--same_length', action='store_true', - help='use the same attn length for all tokens') -parser.add_argument('--attn_type', type=int, default=0, - help='attention type. 0 for ours, 1 for Shaw et al,' - '2 for Vaswani et al, 3 for Al Rfou et al.') -parser.add_argument('--clamp_len', type=int, default=-1, - help='use the same pos embeddings after clamp_len') -parser.add_argument('--eta_min', type=float, default=0.0, - help='min learning rate for cosine scheduler') -parser.add_argument('--gpu0_bsz', type=int, default=-1, - help='batch size on gpu 0') -parser.add_argument('--max_eval_steps', type=int, default=-1, - help='max eval steps') -parser.add_argument('--sample_softmax', type=int, default=-1, - help='number of samples in sampled softmax') -parser.add_argument('--patience', type=int, default=0, - help='patience') -parser.add_argument('--finetune_v2', action='store_true', - help='finetune v2') -parser.add_argument('--finetune_v3', action='store_true', - help='finetune v3') -parser.add_argument('--fp16', action='store_true', - help='Run in pseudo-fp16 mode (fp16 storage fp32 math).') -parser.add_argument('--static-loss-scale', type=float, default=1, - help='Static loss scale, positive power of 2 values can ' - 'improve fp16 convergence.') -parser.add_argument('--dynamic-loss-scale', action='store_true', - help='Use dynamic loss scaling. 
If supplied, this argument' - ' supersedes --static-loss-scale.') -args = parser.parse_args() -args.tied = not args.not_tied - -if args.d_embed < 0: - args.d_embed = args.d_model - -assert args.ext_len >= 0, 'extended context length must be non-negative' -assert args.batch_size % args.batch_chunk == 0 - -args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) -args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) -# logging = create_exp_dir(args.work_dir, -# scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) - -# Set the random seed manually for reproducibility. -np.random.seed(args.seed) -torch.manual_seed(args.seed) -if torch.cuda.is_available(): - if not args.cuda: - print('WARNING: You have a CUDA device, so you should probably run with --cuda') - else: - torch.cuda.manual_seed_all(args.seed) - -# Validate `--fp16` option -if args.fp16: - if not args.cuda: - print('WARNING: --fp16 requires --cuda, ignoring --fp16 option') - args.fp16 = False - else: - try: - from apex.fp16_utils import FP16_Optimizer - except ImportError: - print('WARNING: apex not installed, ignoring --fp16 option') - args.fp16 = False - -device = torch.device('cuda' if args.cuda else 'cpu') - -############################################################################### -# Load data -############################################################################### -corpus = get_lm_corpus(args.data, args.dataset) -ntokens = len(corpus.vocab) -args.n_token = ntokens - -eval_batch_size = 10 -tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) -va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, - device=device, ext_len=args.ext_len) -te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len, - device=device, ext_len=args.ext_len) - -# adaptive softmax / embedding -cutoffs = [] -if args.adaptive: - assert args.dataset in ['wt103', 'lm1b'] - if args.dataset == 
'wt103': - cutoffs = [20000, 40000, 200000] - proj_share_all_but_first = True - elif args.dataset == 'lm1b': - cutoffs = [60000, 100000, 640000] - proj_share_all_but_first = False - -############################################################################### -# Build the model -############################################################################### -def init_weight(weight): - if args.init == 'uniform': - nn.init.uniform_(weight, -args.init_range, args.init_range) - elif args.init == 'normal': - nn.init.normal_(weight, 0.0, args.init_std) - -def init_bias(bias): - nn.init.constant_(bias, 0.0) - -def weights_init(m): - classname = m.__class__.__name__ - if classname.find('Linear') != -1: - if hasattr(m, 'weight') and m.weight is not None: - init_weight(m.weight) - if hasattr(m, 'bias') and m.bias is not None: - init_bias(m.bias) - elif classname.find('AdaptiveEmbedding') != -1: - if hasattr(m, 'emb_projs'): - for i in range(len(m.emb_projs)): - if m.emb_projs[i] is not None: - nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) - elif classname.find('Embedding') != -1: - if hasattr(m, 'weight'): - init_weight(m.weight) - elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: - if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: - init_weight(m.cluster_weight) - if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: - init_bias(m.cluster_bias) - if hasattr(m, 'out_projs'): - for i in range(len(m.out_projs)): - if m.out_projs[i] is not None: - nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) - elif classname.find('LayerNorm') != -1: - if hasattr(m, 'weight'): - nn.init.normal_(m.weight, 1.0, args.init_std) - if hasattr(m, 'bias') and m.bias is not None: - init_bias(m.bias) - elif classname.find('TransformerLM') != -1: - if hasattr(m, 'r_emb'): - init_weight(m.r_emb) - if hasattr(m, 'r_w_bias'): - init_weight(m.r_w_bias) - if hasattr(m, 'r_r_bias'): - init_weight(m.r_r_bias) - if hasattr(m, 'r_bias'): - 
init_bias(m.r_bias) - -def update_dropout(m): - classname = m.__class__.__name__ - if classname.find('Dropout') != -1: - if hasattr(m, 'p'): - m.p = args.dropout - -def update_dropatt(m): - if hasattr(m, 'dropatt'): - m.dropatt.p = args.dropatt - -if args.restart: - with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: - model = torch.load(f) - if not args.fp16: - model = model.float() - model.apply(update_dropout) - model.apply(update_dropatt) -else: - config = TransfoXLConfig(ntokens, n_layer=args.n_layer, n_head=args.n_head, - d_model=args.d_model, d_head=args.d_head, d_inner=args.d_inner, - dropout=args.dropout, dropatt=args.dropatt, - tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, - proj_share_all_but_first=proj_share_all_but_first, - pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, - ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, - same_length=args.same_length, attn_type=args.attn_type, - clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) - model = TransfoXLModel(config) - model.apply(weights_init) - model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing -args.n_all_param = sum([p.nelement() for p in model.parameters()]) -args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) - -if args.fp16: - model = model.half() - -if args.multi_gpu: - model = model.to(device) - if args.gpu0_bsz >= 0: - raise NotImplementedError - # para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, - # model, dim=1).to(device) - else: - para_model = nn.DataParallel(model, dim=1).to(device) -else: - para_model = model.to(device) - -#### optimizer -if args.optim.lower() == 'sgd': - if args.sample_softmax > 0: - dense_params, sparse_params = [], [] - for param in model.parameters(): - if param.size() == model.word_emb.weight.size(): - sparse_params.append(param) - else: - dense_params.append(param) - optimizer_sparse = 
optim.SGD(sparse_params, lr=args.lr * 2) - optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) - else: - optimizer = optim.SGD(model.parameters(), lr=args.lr, - momentum=args.mom) -elif args.optim.lower() == 'adam': - if args.sample_softmax > 0: - dense_params, sparse_params = [], [] - for param in model.parameters(): - if param.size() == model.word_emb.weight.size(): - sparse_params.append(param) - else: - dense_params.append(param) - optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) - optimizer = optim.Adam(dense_params, lr=args.lr) - else: - optimizer = optim.Adam(model.parameters(), lr=args.lr) -elif args.optim.lower() == 'adagrad': - optimizer = optim.Adagrad(model.parameters(), lr=args.lr) - -#### scheduler -if args.scheduler == 'cosine': - # here we do not set eta_min to lr_min to be backward compatible - # because in previous versions eta_min is default to 0 - # rather than the default value of lr_min 1e-6 - scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, - args.max_step, eta_min=args.eta_min) # should use eta_min arg - if args.sample_softmax > 0: - scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, - args.max_step, eta_min=args.eta_min) # should use eta_min arg -elif args.scheduler == 'inv_sqrt': - # originally used for Transformer (in Attention is all you need) - def lr_lambda(step): - # return a multiplier instead of a learning rate - if step == 0 and args.warmup_step == 0: - return 1. - else: - return 1. 
/ (step ** 0.5) if step > args.warmup_step \ - else step / (args.warmup_step ** 1.5) - scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) -elif args.scheduler == 'dev_perf': - scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, - factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) - if args.sample_softmax > 0: - scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, - factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) -elif args.scheduler == 'constant': - pass - -if args.cuda and args.fp16: - # If args.dynamic_loss_scale is False, static_loss_scale will be used. - # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale. - optimizer = FP16_Optimizer(optimizer, - static_loss_scale = args.static_loss_scale, - dynamic_loss_scale = args.dynamic_loss_scale, - dynamic_loss_args = {'init_scale': 2 ** 16}) - -if args.restart: - if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): - with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: - opt_state_dict = torch.load(f) - optimizer.load_state_dict(opt_state_dict) - else: - print('Optimizer was not saved. Start from scratch.') - -logger.info('=' * 100) -for k, v in args.__dict__.items(): - logger.info(' - {} : {}'.format(k, v)) -logger.info('=' * 100) -logger.info('#params = {}'.format(args.n_all_param)) -logger.info('#non emb params = {}'.format(args.n_nonemb_param)) - -############################################################################### -# Training code -############################################################################### - -def evaluate(eval_iter): - # Turn on evaluation mode which disables dropout. - model.eval() - - # If the model does not use memory at all, make the ext_len longer. - # Otherwise, make the mem_len longer and keep the ext_len the same. 
- if args.mem_len == 0: - model.reset_length(args.eval_tgt_len, - args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) - else: - model.reset_length(args.eval_tgt_len, - args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) - - # Evaluation - total_len, total_loss = 0, 0. - with torch.no_grad(): - mems = tuple() - for i, (data, target, seq_len) in enumerate(eval_iter): - if args.max_eval_steps > 0 and i >= args.max_eval_steps: - break - ret = model(data, target, *mems) - loss, mems = ret - loss = loss.mean() - total_loss += seq_len * loss.float().item() - total_len += seq_len - - # Switch back to the training mode - model.reset_length(args.tgt_len, args.ext_len, args.mem_len) - model.train() - - return total_loss / total_len - - -def train(): - # Turn on training mode which enables dropout. - global train_step, train_loss, best_val_loss, eval_start_time, log_start_time - model.train() - if args.batch_chunk > 1: - mems = [tuple() for _ in range(args.batch_chunk)] - else: - mems = tuple() - train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter - for batch, (data, target, seq_len) in enumerate(train_iter): - model.zero_grad() - if args.batch_chunk > 1: - data_chunks = torch.chunk(data, args.batch_chunk, 1) - target_chunks = torch.chunk(target, args.batch_chunk, 1) - for i in range(args.batch_chunk): - data_i = data_chunks[i].contiguous() - target_i = target_chunks[i].contiguous() - ret = para_model(data_i, target_i, *mems[i]) - loss, mems[i] = ret[0], ret[1:] - loss = loss.float().mean().type_as(loss) / args.batch_chunk - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - train_loss += loss.float().item() - else: - ret = para_model(data, target, *mems) - loss, mems = ret[0], ret[1:] - loss = loss.float().mean().type_as(loss) - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - train_loss += loss.float().item() - - if args.fp16: - optimizer.clip_master_grads(args.clip) - else: - 
torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) - - optimizer.step() - if args.sample_softmax > 0: - optimizer_sparse.step() - - # step-wise learning rate annealing - train_step += 1 - if args.scheduler in ['cosine', 'constant', 'dev_perf']: - # linear warmup stage - if train_step < args.warmup_step: - curr_lr = args.lr * train_step / args.warmup_step - optimizer.param_groups[0]['lr'] = curr_lr - if args.sample_softmax > 0: - optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 - else: - if args.scheduler == 'cosine': - scheduler.step(train_step) - if args.sample_softmax > 0: - scheduler_sparse.step(train_step) - elif args.scheduler == 'inv_sqrt': - scheduler.step(train_step) - - if train_step % args.log_interval == 0: - cur_loss = train_loss / args.log_interval - elapsed = time.time() - log_start_time - log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ - '| ms/batch {:5.2f} | loss {:5.2f}'.format( - epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], - elapsed * 1000 / args.log_interval, cur_loss) - if args.dataset in ['enwik8', 'text8']: - log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) - else: - log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) - logger.info(log_str) - train_loss = 0 - log_start_time = time.time() - - if train_step % args.eval_interval == 0: - val_loss = evaluate(va_iter) - logger.info('-' * 100) - log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ - '| valid loss {:5.2f}'.format( - train_step // args.eval_interval, train_step, - (time.time() - eval_start_time), val_loss) - if args.dataset in ['enwik8', 'text8']: - log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) - else: - log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) - logger.info(log_str) - logger.info('-' * 100) - # Save the model if the validation loss is the best we've seen so far. 
- if not best_val_loss or val_loss < best_val_loss: - if not args.debug: - with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f: - torch.save(model, f) - with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f: - torch.save(optimizer.state_dict(), f) - best_val_loss = val_loss - - # dev-performance based learning rate annealing - if args.scheduler == 'dev_perf': - scheduler.step(val_loss) - if args.sample_softmax > 0: - scheduler_sparse.step(val_loss) - - eval_start_time = time.time() - - if train_step == args.max_step: - break - -# Loop over epochs. -train_step = 0 -train_loss = 0 -best_val_loss = None - -log_start_time = time.time() -eval_start_time = time.time() - -# At any point you can hit Ctrl + C to break out of training early. -try: - for epoch in itertools.count(start=1): - train() - if train_step == args.max_step: - logger.info('-' * 100) - logger.info('End of training') - break -except KeyboardInterrupt: - logger.info('-' * 100) - logger.info('Exiting from training early') - -# Load the best saved model. -with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f: - model = torch.load(f) -para_model = model.to(device) - -# Run on test data. -test_loss = evaluate(te_iter) -logger.info('=' * 100) -if args.dataset in ['enwik8', 'text8']: - logger.info('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( - test_loss, test_loss / math.log(2))) -else: - logger.info('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( - test_loss, math.exp(test_loss))) -logger.info('=' * 100) diff --git a/examples/transfo_xl_eval.py b/examples/transfo_xl_eval.py deleted file mode 100644 index 4f3606a97e..0000000000 --- a/examples/transfo_xl_eval.py +++ /dev/null @@ -1,139 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Transformer XL model evaluation script. - Adapted from https://github.com/kimiyoung/transformer-xl. - In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py - - This script with default values evaluates a pretrained Transformer-XL on WikiText 103 -""" -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import logging -import time -import math - -import torch - -from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus - -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) -logger = logging.getLogger(__name__) - - -parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') -parser.add_argument('--model_name', type=str, default='transfo-xl-wt103', - help='pretrained model name') -parser.add_argument('--split', type=str, default='test', - choices=['all', 'valid', 'test'], - help='which split to evaluate') -parser.add_argument('--batch_size', type=int, default=10, - help='batch size') -parser.add_argument('--tgt_len', type=int, default=128, - help='number of tokens to predict') -parser.add_argument('--ext_len', type=int, default=0, - help='length of the extended context') -parser.add_argument('--mem_len', type=int, default=1600, - help='length of the retained previous heads') 
-parser.add_argument('--clamp_len', type=int, default=1000, - help='max positional embedding index') -parser.add_argument('--cuda', action='store_true', - help='use CUDA') -parser.add_argument('--work_dir', type=str, required=True, - help='path to the work_dir') -parser.add_argument('--no_log', action='store_true', - help='do not log the eval result') -parser.add_argument('--same_length', action='store_true', - help='set same length attention with masking') -args = parser.parse_args() -assert args.ext_len >= 0, 'extended context length must be non-negative' - -device = torch.device("cuda" if args.cuda else "cpu") - -# Load a pre-processed dataset -# You can also build the corpus yourself using TransfoXLCorpus methods -# The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax -# and tokenizing the dataset -# The pre-processed corpus is a convertion (using the conversion script ) -corpus = TransfoXLCorpus.from_pretrained(args.model_name) -ntokens = len(corpus.vocab) - -va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) -te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) - -# Load a pre-trained model -model = TransfoXLModel.from_pretrained(args.model_name) -model = model.to(device) - -logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( - args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) - -model.reset_length(args.tgt_len, args.ext_len, args.mem_len) -if args.clamp_len > 0: - model.clamp_len = args.clamp_len -if args.same_length: - model.same_length = True - -############################################################################### -# Evaluation code -############################################################################### -def evaluate(eval_iter): - # Turn on evaluation mode which disables dropout. 
- model.eval() - total_len, total_loss = 0, 0. - start_time = time.time() - with torch.no_grad(): - mems = tuple() - for idx, (data, target, seq_len) in enumerate(eval_iter): - ret = model(data, target, *mems) - loss, mems = ret - loss = loss.mean() - total_loss += seq_len * loss.item() - total_len += seq_len - total_time = time.time() - start_time - logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format( - total_time, 1000 * total_time / (idx+1))) - return total_loss / total_len - -# Run on test data. -if args.split == 'all': - test_loss = evaluate(te_iter) - valid_loss = evaluate(va_iter) -elif args.split == 'valid': - valid_loss = evaluate(va_iter) - test_loss = None -elif args.split == 'test': - test_loss = evaluate(te_iter) - valid_loss = None - -def format_log(loss, split): - log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( - split, loss, math.exp(loss)) - return log_str - -log_str = '' -if valid_loss is not None: - log_str += format_log(valid_loss, 'valid') -if test_loss is not None: - log_str += format_log(test_loss, 'test') - -logger.info('=' * 100) -logger.info(log_str) -logger.info('=' * 100)