diff --git a/.gitignore b/.gitignore index 8abc9b84e1..05129fc402 100644 --- a/.gitignore +++ b/.gitignore @@ -123,4 +123,7 @@ tensorflow_code # Models models -proc_data \ No newline at end of file +proc_data + +# examples +examples/runs \ No newline at end of file diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py index bedca65bb7..154aac332a 100644 --- a/examples/run_xlnet_classifier.py +++ b/examples/run_xlnet_classifier.py @@ -54,91 +54,58 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", - default=None, - type=str, - required=True, + parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str, - help="XLNet pre-trained model: currently only xlnet-large-cased.") - parser.add_argument("--task_name", - default=None, - type=str, - required=True, + parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") - parser.add_argument("--output_dir", - default=None, - type=str, - required=True, + parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") - - ## Other parameters - parser.add_argument("--cache_dir", - default="", - type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after WordPiece tokenization. 
\n" - "Sequences longer than this will be truncated, and sequences shorter \n" - "than this will be padded.") - parser.add_argument("--do_train", - action='store_true', + # training + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") - parser.add_argument("--do_eval", - action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--do_lower_case", - action='store_true', - help="Set this flag if you are using an uncased model.") - parser.add_argument("--train_batch_size", - default=32, - type=int, - help="Total batch size for training.") - parser.add_argument("--eval_batch_size", - default=8, - type=int, - help="Total batch size for eval.") - parser.add_argument("--learning_rate", - default=5e-5, - type=float, + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--num_train_epochs", - default=3.0, - type=float, + parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") - parser.add_argument("--warmup_proportion", - default=0.1, - type=float, + parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") - parser.add_argument("--no_cuda", - action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', - action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--seed', - type=int, - default=42, - help="random seed for initialization") - parser.add_argument('--gradient_accumulation_steps', - type=int, - default=1, + parser.add_argument("--train_batch_size", default=32, type=int, + help="Total batch size for training.") + parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument('--fp16', - action='store_true', + parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") - parser.add_argument('--loss_scale', - type=float, default=0, + parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") + # evaluation + parser.add_argument("--do_eval", action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--eval_batch_size", default=8, type=int, + help="Total batch size for eval.") + # Model + parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str, + help="XLNet pre-trained model: currently only xlnet-large-cased.") + parser.add_argument("--do_lower_case", action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--cache_dir", default="", type=str, + help="Where do you want to store the pre-trained models downloaded from s3") + # task specific + parser.add_argument("--max_seq_length", default=128, type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument('--overwrite_output_dir', action='store_true', + help="Overwrite the content of the output directory") + # Misc + parser.add_argument("--no_cuda", action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--local_rank", type=int, default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', type=int, default=42, + help="random seed for initialization") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() @@ -306,7 +273,7 @@ def main(): input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes - logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) + logits, _ = model(input_ids, 
token_type_ids=segment_ids, attention_mask=input_mask) if output_mode == "classification": loss_fct = CrossEntropyLoss() @@ -420,7 +387,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) + logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) # create eval loss and other metric required by the task if output_mode == "classification": @@ -501,7 +468,7 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) + logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py new file mode 100644 index 0000000000..b01bf82a55 --- /dev/null +++ b/examples/run_xlnet_squad.py @@ -0,0 +1,398 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Run BERT on SQuAD.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import os +import random +import sys +from io import open + +import numpy as np +import torch +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange + +from tensorboardX import SummaryWriter + +from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME +from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering +from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer +from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule + +from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--bert_model", default=None, type=str, required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model checkpoints and predictions will be written.") + + ## Other parameters + parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") + parser.add_argument("--predict_file", default=None, type=str, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + parser.add_argument("--max_seq_length", default=384, type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.") + parser.add_argument("--doc_stride", default=128, type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.") + parser.add_argument("--max_query_length", default=64, type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.") + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") + parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " + "of training.") + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + parser.add_argument("--verbose_logging", action='store_true', + help="If true, all of the warnings related to data processing will be printed. 
" + "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--overwrite_output_dir', + action='store_true', + help="Overwrite the content of the output directory") + parser.add_argument('--loss_scale', + type=float, default=0, + help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + parser.add_argument('--version_2_with_negative', + action='store_true', + help='If true, the SQuAD examples contain some that do not have an answer.') + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + args = parser.parse_args() + print(args) + + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of synchronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size 
= args.train_batch_size // args.gradient_accumulation_steps + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if args.do_train: + if not args.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if args.do_predict: + if not args.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + model = BertForQuestionAnswering.from_pretrained(args.bert_model) + if args.local_rank == 0: + torch.distributed.barrier() + + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + if args.do_train: + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + # Prepare data loader + train_examples = read_squad_examples( + input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) + cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( + list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) 
+ try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except Exception: + train_features = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=True) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + # if args.local_rank != -1: + # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + 
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + global_step = 0 + + logger.info("***** Running training *****") + logger.info(" Num orig examples = %d", len(train_examples)) + logger.info(" Num split examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + + model.train() + for epoch in trange(int(args.num_train_epochs), desc="Epoch"): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + if n_gpu == 1: + batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self + input_ids, input_mask, segment_ids, start_positions, end_positions = batch + loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16: + # modify learning rate with special warm up BERT uses + # if args.fp16 is False, BertAdam is used and handles this automatically + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + if args.local_rank in [-1, 0]: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) + + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + # Save a trained model, configuration and tokenizer + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(args.output_dir, CONFIG_NAME) + + torch.save(model_to_save.state_dict(), output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(args.output_dir) + + # Load a trained model and vocabulary that you have fine-tuned + model = BertForQuestionAnswering.from_pretrained(args.output_dir) + tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) + + # Good practice: save your training arguments together with the trained model + output_args_file = os.path.join(args.output_dir, 'training_args.bin') + torch.save(args, output_args_file) + else: + model = BertForQuestionAnswering.from_pretrained(args.bert_model) + + model.to(device) + + if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + eval_examples 
= read_squad_examples( + input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) + eval_features = convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=False) + + logger.info("***** Running predictions *****") + logger.info(" Num orig examples = %d", len(eval_examples)) + logger.info(" Num split examples = %d", len(eval_features)) + logger.info(" Batch size = %d", args.predict_batch_size) + + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) + + model.eval() + all_results = [] + logger.info("Start evaluating") + for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): + if len(all_results) % 1000 == 0: + logger.info("Processing example: %d" % (len(all_results))) + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + with torch.no_grad(): + batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) + for i, example_index in enumerate(example_indices): + start_logits = batch_start_logits[i].detach().cpu().tolist() + end_logits = batch_end_logits[i].detach().cpu().tolist() + eval_feature = eval_features[example_index.item()] + unique_id = 
int(eval_feature.unique_id) + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + output_prediction_file = os.path.join(args.output_dir, "predictions.json") + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") + write_predictions(eval_examples, eval_features, all_results, + args.n_best_size, args.max_answer_length, + args.do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, args.verbose_logging, + args.version_2_with_negative, args.null_score_diff_threshold) + + +if __name__ == "__main__": + main() diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2b67a260f0..95bdd7452f 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -606,7 +606,7 @@ class BertPreTrainedModel(nn.Module): )) self.config = config - def init_bert_weights(self, module): + def init_weights(self, module): """ Initialize the weights. """ if isinstance(module, (nn.Linear, nn.Embedding)): @@ -823,7 +823,7 @@ class BertModel(BertPreTrainedModel): self.encoder = BertEncoder(config, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) self.pooler = BertPooler(config) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
@@ -951,7 +951,7 @@ class BertForPreTraining(BertPreTrainedModel): self.bert = BertModel(config, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, @@ -1030,7 +1030,7 @@ class BertForMaskedLM(BertPreTrainedModel): self.bert = BertModel(config, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, @@ -1105,7 +1105,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): self.bert = BertModel(config, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) self.cls = BertOnlyNSPHead(config) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, @@ -1184,7 +1184,7 @@ class BertForSequenceClassification(BertPreTrainedModel): keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, 
output_all_encoded_layers=False, head_mask=head_mask) @@ -1261,7 +1261,7 @@ class BertForMultipleChoice(BertPreTrainedModel): keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1343,7 +1343,7 @@ class BertForTokenClassification(BertPreTrainedModel): keep_multihead_output=keep_multihead_output) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask) @@ -1428,7 +1428,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): self.bert = BertModel(config, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) self.qa_outputs = nn.Linear(config.hidden_size, 2) - self.apply(self.init_bert_weights) + self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py index 45cd6350d5..9cdf82bbc3 100644 --- a/pytorch_pretrained_bert/modeling_xlnet.py +++ b/pytorch_pretrained_bert/modeling_xlnet.py @@ -633,7 +633,7 @@ class XLNetPreTrainedModel(nn.Module): )) self.config = config - def init_xlnet_weights(self, module): + def init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, (nn.Linear, nn.Embedding)): @@ -904,14 +904,14 @@ class XLNetModel(XLNetPreTrainedModel): pos_emb = pos_emb.to(next(self.parameters())) return pos_emb - def forward(self, inp_k, seg_id=None, input_mask=None, + def forward(self, inp_k, token_type_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, inp_q=None, output_all_encoded_layers=True, head_mask=None): """ Args: inp_k: int32 Tensor in shape [bsz, len], the input token IDs. - seg_id: int32 Tensor in shape [bsz, len], the input segment IDs. - input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask. + token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs. + attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask. 0 for real tokens and 1 for padding. mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory from previous batches. The length of the list equals n_layer. @@ -945,8 +945,8 @@ class XLNetModel(XLNetPreTrainedModel): # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end inp_k = inp_k.transpose(0, 1).contiguous() - seg_id = seg_id.transpose(0, 1).contiguous() if seg_id is not None else None - input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None + token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None + attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None @@ -969,11 +969,11 @@ class XLNetModel(XLNetPreTrainedModel): raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) # 
data mask: input mask & perm mask - if input_mask is not None and perm_mask is not None: - data_mask = input_mask[None] + perm_mask - elif input_mask is not None and perm_mask is None: - data_mask = input_mask[None] - elif input_mask is None and perm_mask is not None: + if attention_mask is not None and perm_mask is not None: + data_mask = attention_mask[None] + perm_mask + elif attention_mask is not None and perm_mask is None: + data_mask = attention_mask[None] + elif attention_mask is None and perm_mask is not None: data_mask = perm_mask else: data_mask = None @@ -1011,13 +1011,13 @@ class XLNetModel(XLNetPreTrainedModel): output_g = None ##### Segment embedding - if seg_id is not None: - # Convert `seg_id` to one-hot `seg_mat` + if token_type_ids is not None: + # Convert `token_type_ids` to one-hot `seg_mat` mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device) - cat_ids = torch.cat([mem_pad, seg_id], dim=0) + cat_ids = torch.cat([mem_pad, token_type_ids], dim=0) # `1` indicates not in the same segment [qlen x klen x bsz] - seg_mat = (seg_id[:, None] != cat_ids[None, :]).long() + seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long() seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float) else: seg_mat = None @@ -1076,8 +1076,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): Inputs: inp_k: int32 Tensor in shape [bsz, len], the input token IDs. - seg_id: int32 Tensor in shape [bsz, len], the input segment IDs. - input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask. + token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs. + attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask. 0 for real tokens and 1 for padding. mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory from previous batches. The length of the list equals n_layer. 
@@ -1112,14 +1112,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, n_layer=12, num_attention_heads=12, intermediate_size=3072) model = modeling.XLNetModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask) ``` """ def __init__(self, config, output_attentions=False, keep_multihead_output=False): @@ -1134,7 +1134,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): # Tie weights - self.apply(self.init_xlnet_weights) + self.apply(self.init_weights) self.tie_weights() def tie_weights(self): @@ -1142,14 +1142,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): """ self.lm_loss.weight = self.transformer.word_embedding.weight - def forward(self, inp_k, seg_id=None, input_mask=None, + def forward(self, inp_k, token_type_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, inp_q=None, target=None, output_all_encoded_layers=True, head_mask=None): """ Args: inp_k: int32 Tensor in shape [bsz, len], the input token IDs. - seg_id: int32 Tensor in shape [bsz, len], the input segment IDs. - input_mask: float32 Tensor in shape [bsz, len], the input mask. + token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs. + attention_mask: float32 Tensor in shape [bsz, len], the input mask. 0 for real tokens and 1 for padding. mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory from previous batches. The length of the list equals n_layer. 
@@ -1171,7 +1171,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): summary_type: str, "last", "first", "mean", or "attn". The method to pool the input to get a vector representation. """ - output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask, + output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, attention_mask, mems, perm_mask, target_mapping, inp_q, output_all_encoded_layers, head_mask) @@ -1200,7 +1200,7 @@ class XLNetSequenceSummary(nn.Module): super(XLNetSequenceSummary, self).__init__() self.summary_type = summary_type if use_proj: - self.summary = nn.Linear(config.hidden_size, num_labels) + self.summary = nn.Linear(config.d_model, config.d_model) else: self.summary = None if summary_type == 'attn': @@ -1211,19 +1211,20 @@ class XLNetSequenceSummary(nn.Module): self.dropout = nn.Dropout(config.dropout) self.activation = nn.Tanh() - def forward(self, hidden_states, input_mask=None): + def forward(self, hidden_states): + """ hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer.""" if self.summary_type == 'last': - output = hidden_states[-1] + output = hidden_states[:, -1] elif self.summary_type == 'first': - output = hidden_states[0] + output = hidden_states[:, 0] elif self.summary_type == 'mean': - output = hidden_states.mean(dim=0) + output = hidden_states.mean(dim=1) elif summary_type == 'attn': raise NotImplementedError output = self.summary(output) - output = self.dropout(output) output = self.activation(output) + output = self.dropout(output) return output @@ -1240,8 +1241,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): Inputs: inp_k: int32 Tensor in shape [bsz, len], the input token IDs. - seg_id: int32 Tensor in shape [bsz, len], the input segment IDs. - input_mask: float32 Tensor in shape [bsz, len], the input mask. + token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs. 
+ attention_mask: float32 Tensor in shape [bsz, len], the input mask. 0 for real tokens and 1 for padding. mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory from previous batches. The length of the list equals n_layer. @@ -1277,14 +1278,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, n_layer=12, num_attention_heads=12, intermediate_size=3072) model = modeling.XLNetModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask) ``` """ def __init__(self, config, summary_type="last", use_proj=True, num_labels=2, @@ -1302,17 +1303,17 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type, use_proj=use_proj, output_attentions=output_attentions, keep_multihead_output=keep_multihead_output) - self.loss_proj = nn.Linear(config.d_model, num_classes if not is_regression else 1) - self.apply(self.init_bert_weights) + self.loss_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1) + self.apply(self.init_weights) - def forward(self, inp_k, seg_id=None, input_mask=None, + def forward(self, inp_k, token_type_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, inp_q=None, target=None, output_all_encoded_layers=True, head_mask=None): """ Args: inp_k: int32 Tensor in shape [bsz, len], the input token IDs. - seg_id: int32 Tensor in shape [bsz, len], the input segment IDs. 
- input_mask: float32 Tensor in shape [bsz, len], the input mask. + token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs. + attention_mask: float32 Tensor in shape [bsz, len], the input mask. 0 for real tokens and 1 for padding. mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory from previous batches. The length of the list equals n_layer. @@ -1331,7 +1332,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): Only used during pretraining for two-stream attention. Set to None during finetuning. """ - output, _, new_mems = self.transformer(inp_k, seg_id, input_mask, + output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask, mems, perm_mask, target_mapping, inp_q, output_all_encoded_layers, head_mask) @@ -1356,3 +1357,96 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): # if self.output_attentions: return logits, new_mems # return all_attentions, encoded_layers, pooled_output + +class XLNetForQuestionAnswering(XLNetPreTrainedModel): + """XLNet model for Question Answering (span extraction). + This module is composed of the XLNet model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a XLNetConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. 
Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see XLNet paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = XLNetForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, attention_mask) + ``` + """ + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(XLNetForQuestionAnswering, self).__init__(config) + self.output_attentions = output_attentions + self.transformer = XLNetModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_weights) + + def forward(self, inp_k, token_type_ids=None, attention_mask=None, + mems=None, perm_mask=None, target_mapping=None, inp_q=None, + start_positions=None, end_positions=None, + output_all_encoded_layers=True, head_mask=None): + output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask, + mems, perm_mask, target_mapping, inp_q, + output_all_encoded_layers, head_mask) + + logits = self.qa_outputs(output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, 
ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + elif self.output_attentions: + return all_attentions, start_logits, end_logits + return start_logits, end_logits