Merge branch 'xlnet'

2019-07-16 11:51:13 +02:00
parent 78462aad61 1b35d05d4b
commit f31154cb9d
136 changed files with 16407 additions and 10024 deletions
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -1,310 +0,0 @@
-#!/usr/bin/env python3
-import os
-import argparse
-import logging
-from datetime import timedelta, datetime
-from tqdm import tqdm
-
-import numpy as np
-
-import torch
-from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
-
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
-
-
-logger = logging.getLogger(__name__)
-
-
-def entropy(p):
-    plogp = p * torch.log(p)
-    plogp[p == 0] = 0
-    return -plogp.sum(dim=-1)
-
-
-def print_1d_tensor(tensor, prefix=""):
-    if tensor.dtype != torch.long:
-        logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data))
-    else:
-        logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data))
-
-
-def print_2d_tensor(tensor):
-    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
-    for row in range(len(tensor)):
-        print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t")
-
-
-def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
-    """ Example on how to use model outputs to compute:
-        - head attention entropy (activated by setting output_attentions=True when we created the model
-        - head importance scores according to http://arxiv.org/abs/1905.10650
-            (activated by setting keep_multihead_output=True when we created the model)
-    """
-    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
-    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
-    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
-    preds = None
-    labels = None
-    tot_tokens = 0.0
-
-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
-
-        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask)
-
-        if compute_entropy:
-            # Update head attention entropy
-            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
-                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
-
-        if compute_importance:
-            # Update head importance scores with regards to our loss
-            # First, backpropagate to populate the gradients
-            if args.output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
-            elif args.output_mode == "regression":
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), label_ids.view(-1))
-            loss.backward()
-            # Second, compute importance scores according to http://arxiv.org/abs/1905.10650
-            multihead_outputs = model.bert.get_multihead_outputs()
-            for layer, mh_layer_output in enumerate(multihead_outputs):
-                dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
-                head_importance[layer] += dot.abs().sum(-1).sum(0).detach()
-
-        # Also store our logits/labels if we want to compute metrics afterwards
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
-
-        tot_tokens += input_mask.float().detach().sum().data
-
-    # Normalize
-    attn_entropy /= tot_tokens
-    head_importance /= tot_tokens
-    # Layerwise importance normalization
-    if not args.dont_normalize_importance_by_layer:
-        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
-        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
-
-    if not args.dont_normalize_global_importance:
-        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
-
-    return attn_entropy, head_importance, preds, labels
-
-
-def run_model():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.")
-    parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory")
-
-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1")
-
-    parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics"
-                                                                             "(stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
-
-    parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
-
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup devices and distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        args.device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
-
-    # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1)))
-
-    # Set seeds
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed(args.seed)
-
-    # Prepare GLUE task
-    task_name = args.task_name.lower()
-    processor = processors[task_name]()
-    label_list = processor.get_labels()
-    args.output_mode = output_modes[task_name]
-    args.num_labels = len(label_list)
-
-    # Prepare output directory
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    # Load model & tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
-
-    # Load a model with all BERTology options on:
-    #   output_attentions => will output attention weights
-    #   keep_multihead_output => will store gradient of attention head outputs for head importance computation
-    #       see: http://arxiv.org/abs/1905.10650
-    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
-                                                          num_labels=args.num_labels,
-                                                          output_attentions=True,
-                                                          keep_multihead_output=True)
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
-    model.to(args.device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
-    model.eval()
-
-    # Prepare dataset for the GLUE task
-    eval_examples = processor.get_dev_examples(args.data_dir)
-    cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-        list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
-    try:
-        eval_features = torch.load(cached_eval_features_file)
-    except:
-        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode)
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving eval features to cache file %s", cached_eval_features_file)
-            torch.save(eval_features, cached_eval_features_file)
-
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float)
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-
-    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    # Print/save training arguments
-    print(args)
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
-
-    # Compute head entropy and importance score
-    attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader)
-
-    # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
-
-    logger.info("Attention entropies")
-    print_2d_tensor(attn_entropy)
-    logger.info("Head importance scores")
-    print_2d_tensor(head_importance)
-    logger.info("Head ranked by importance scores")
-    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
-    head_ranks = head_ranks.view_as(head_importance)
-    print_2d_tensor(head_ranks)
-
-    # Do masking if we want to
-    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        original_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-        logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
-
-        new_head_mask = torch.ones_like(head_importance)
-        num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
-
-        current_score = original_score
-        while current_score >= original_score * args.masking_threshold:
-            head_mask = new_head_mask.clone() # save current head mask
-            # heads from least important to most - keep only not-masked heads
-            head_importance[head_mask == 0.0] = float('Inf')
-            current_heads_to_mask = head_importance.view(-1).sort()[1]
-
-            if len(current_heads_to_mask) <= num_to_mask:
-                break
-
-            # mask heads
-            current_heads_to_mask = current_heads_to_mask[:num_to_mask]
-            logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
-            new_head_mask = new_head_mask.view(-1)
-            new_head_mask[current_heads_to_mask] = 0.0
-            new_head_mask = new_head_mask.view_as(head_mask)
-            print_2d_tensor(new_head_mask)
-
-            # Compute metric and head importance again
-            _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
-            preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-            current_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-            logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
-
-        logger.info("Final head mask")
-        print_2d_tensor(head_mask)
-        np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
-
-        # Try pruning and test time speedup
-        # Pruning is like masking but we actually remove the masked weights
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=head_mask)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_masking = compute_metrics(task_name, preds, labels)[args.metric_name]
-        original_time = datetime.now() - before_time
-
-        original_num_params = sum(p.numel() for p in model.parameters())
-        heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
-        assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
-        model.bert.prune_heads(heads_to_prune)
-        pruned_num_params = sum(p.numel() for p in model.parameters())
-
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=None)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name]
-        new_time = datetime.now() - before_time
-
-        logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
-        logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-        logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
-
-if __name__ == '__main__':
-    run_model()
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -1,297 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Extract pre-computed feature vectors from a PyTorch BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import logging
-import json
-import re
-
-import torch
-from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import BertModel
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-
-    def __init__(self, unique_id, text_a, text_b):
-        self.unique_id = unique_id
-        self.text_a = text_a
-        self.text_b = text_b
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
-        self.unique_id = unique_id
-        self.tokens = tokens
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.input_type_ids = input_type_ids
-
-
-def convert_examples_to_features(examples, seq_length, tokenizer):
-    """Loads a data file into a list of `InputFeature`s."""
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > seq_length - 2:
-                tokens_a = tokens_a[0:(seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambigiously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        input_type_ids = []
-        tokens.append("[CLS]")
-        input_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            input_type_ids.append(0)
-        tokens.append("[SEP]")
-        input_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                input_type_ids.append(1)
-            tokens.append("[SEP]")
-            input_type_ids.append(1)
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        while len(input_ids) < seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            input_type_ids.append(0)
-
-        assert len(input_ids) == seq_length
-        assert len(input_mask) == seq_length
-        assert len(input_type_ids) == seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("unique_id: %s" % (example.unique_id))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
-
-        features.append(
-            InputFeatures(
-                unique_id=example.unique_id,
-                tokens=tokens,
-                input_ids=input_ids,
-                input_mask=input_mask,
-                input_type_ids=input_type_ids))
-    return features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def read_examples(input_file):
-    """Read a list of `InputExample`s from an input file."""
-    examples = []
-    unique_id = 0
-    with open(input_file, "r", encoding='utf-8') as reader:
-        while True:
-            line = reader.readline()
-            if not line:
-                break
-            line = line.strip()
-            text_a = None
-            text_b = None
-            m = re.match(r"^(.*) \|\|\| (.*)$", line)
-            if m is None:
-                text_a = line
-            else:
-                text_a = m.group(1)
-                text_b = m.group(2)
-            examples.append(
-                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
-            unique_id += 1
-    return examples
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--input_file", default=None, type=str, required=True)
-    parser.add_argument("--output_file", default=None, type=str, required=True)
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-
-    ## Other parameters
-    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
-                            "than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help = "local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
-
-    layer_indexes = [int(x) for x in args.layers.split(",")]
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    examples = read_examples(args.input_file)
-
-    features = convert_examples_to_features(
-        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
-
-    unique_id_to_feature = {}
-    for feature in features:
-        unique_id_to_feature[feature.unique_id] = feature
-
-    model = BertModel.from_pretrained(args.bert_model)
-    model.to(device)
-
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
-    if args.local_rank == -1:
-        eval_sampler = SequentialSampler(eval_data)
-    else:
-        eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    model.eval()
-    with open(args.output_file, "w", encoding='utf-8') as writer:
-        for input_ids, input_mask, example_indices in eval_dataloader:
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-
-            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
-            all_encoder_layers = all_encoder_layers
-
-            for b, example_index in enumerate(example_indices):
-                feature = features[example_index.item()]
-                unique_id = int(feature.unique_id)
-                # feature = unique_id_to_feature[unique_id]
-                output_json = collections.OrderedDict()
-                output_json["linex_index"] = unique_id
-                all_out_features = []
-                for (i, token) in enumerate(feature.tokens):
-                    all_layers = []
-                    for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
-                        layer_output = layer_output[b]
-                        layers = collections.OrderedDict()
-                        layers["index"] = layer_index
-                        layers["values"] = [
-                            round(x.item(), 6) for x in layer_output[i]
-                        ]
-                        all_layers.append(layers)
-                    out_features = collections.OrderedDict()
-                    out_features["token"] = token
-                    out_features["layers"] = all_layers
-                    all_out_features.append(out_features)
-                output_json["features"] = all_out_features
-                writer.write(json.dumps(output_json) + "\n")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -13,10 +13,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm

-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")

@@ -273,7 +273,7 @@ def main():
        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                             t_total=num_train_optimization_steps)
    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -6,7 +6,7 @@ import shelve
 from multiprocessing import Pool

 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,10 +29,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule

 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+# Copyright 2018 CMU and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Bertology: this script shows how you can explore the internals of the models in the library to:
+    - compute the entropy of the head attentions
+    - compute the importance of each head
+    - prune (remove) the low importance head.
+    Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
+    which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
+"""
+import os
+import argparse
+import logging
+from datetime import timedelta, datetime
+from tqdm import tqdm
+
+import numpy as np
+
+import torch
+from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from pytorch_transformers import (WEIGHTS_NAME,
+                                  BertConfig, BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
+                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+
+from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
+
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
+
+logger = logging.getLogger(__name__)
+
+
+def entropy(p):
+    """ Compute the entropy of a probability distribution """
+    plogp = p * torch.log(p)
+    plogp[p == 0] = 0
+    return -plogp.sum(dim=-1)
+
+
+def print_2d_tensor(tensor):
+    """ Print a 2D tensor """
+    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
+    for row in range(len(tensor)):
+        if tensor.dtype != torch.long:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
+        else:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
+
+
+def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
+    """ This method shows how to compute:
+        - head attention entropy
+        - head importance scores according to http://arxiv.org/abs/1905.10650
+    """
+    # Prepare our tensors
+    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
+    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
+    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
+
+    if head_mask is None:
+        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+    head_mask.requires_grad_(requires_grad=True)
+    preds = None
+    labels = None
+    tot_tokens = 0.0
+
+    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        batch = tuple(t.to(args.device) for t in batch)
+        input_ids, input_mask, segment_ids, label_ids = batch
+
+        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
+        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
+        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        loss.backward()  # Backpropagate to populate the gradients in the head mask
+
+        if compute_entropy:
+            for layer, attn in enumerate(all_attentions):
+                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
+                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
+
+        if compute_importance:
+            head_importance += head_mask.grad.abs().detach()
+
+        # Also store our logits/labels if we want to compute metrics afterwards
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            labels = label_ids.detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
+
+        tot_tokens += input_mask.float().detach().sum().data
+
+    # Normalize
+    attn_entropy /= tot_tokens
+    head_importance /= tot_tokens
+    # Layerwise importance normalization
+    if not args.dont_normalize_importance_by_layer:
+        exponent = 2
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
+        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
+
+    if not args.dont_normalize_global_importance:
+        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
+
+    # Print/save matrices
+    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+
+    logger.info("Attention entropies")
+    print_2d_tensor(attn_entropy)
+    logger.info("Head importance scores")
+    print_2d_tensor(head_importance)
+    logger.info("Head ranked by importance scores")
+    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks = head_ranks.view_as(head_importance)
+    print_2d_tensor(head_ranks)
+
+    return attn_entropy, head_importance, preds, labels
+
+
+def mask_heads(args, model, eval_dataloader):
+    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
+        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+
+    new_head_mask = torch.ones_like(head_importance)
+    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
+
+    current_score = original_score
+    while current_score >= original_score * args.masking_threshold:
+        head_mask = new_head_mask.clone() # save current head mask
+        # heads from least important to most - keep only not-masked heads
+        head_importance[head_mask == 0.0] = float('Inf')
+        current_heads_to_mask = head_importance.view(-1).sort()[1]
+
+        if len(current_heads_to_mask) <= num_to_mask:
+            break
+
+        # mask heads
+        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
+        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
+        new_head_mask = new_head_mask.view(-1)
+        new_head_mask[current_heads_to_mask] = 0.0
+        new_head_mask = new_head_mask.view_as(head_mask)
+        print_2d_tensor(new_head_mask)
+
+        # Compute metric and head importance again
+        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+
+    logger.info("Final head mask")
+    print_2d_tensor(head_mask)
+    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+
+    return head_mask
+
+
+def prune_heads(args, model, eval_dataloader, head_mask):
+    """ This method shows how to prune head (remove heads weights) based on
+        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    # Try pruning and test time speedup
+    # Pruning is like masking but we actually remove the masked weights
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_time = datetime.now() - before_time
+
+    original_num_params = sum(p.numel() for p in model.parameters())
+    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
+    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
+    model.prune_heads(heads_to_prune)
+    pruned_num_params = sum(p.numel() for p in model.parameters())
+
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    new_time = datetime.now() - before_time
+
+    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
+    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--data_subset", type=int, default=-1,
+                        help="If > 0: limit the data to a subset of data_subset instances.")
+    parser.add_argument("--overwrite_output_dir", action='store_true',
+                        help="Whether to overwrite data in output directory")
+
+    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
+                        help="Don't normalize importance score by layers")
+    parser.add_argument("--dont_normalize_global_importance", action='store_true',
+                        help="Don't normalize all importance scores between 0 and 1")
+
+    parser.add_argument("--try_masking", action='store_true',
+                        help="Whether to try to mask head until a threshold of accuracy.")
+    parser.add_argument("--masking_threshold", default=0.9, type=float,
+                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
+    parser.add_argument("--masking_amount", default=0.1, type=float,
+                        help="Amount to heads to masking at each masking step.")
+    parser.add_argument("--metric_name", default="acc", type=str,
+                        help="Metric to use for head masking.")
+
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, sequences shorter padded.")
+    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
+
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup devices and distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        args.device = torch.device("cuda", args.local_rank)
+        args.n_gpu = 1
+        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
+
+    # Setup logging
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
+
+    # Set seeds
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
+                                          num_labels=num_labels, finetuning_task=args.task_name,
+                                          output_attentions=True)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Print/save training arguments
+    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Prepare dataset for the GLUE task
+    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    if args.data_subset > 0:
+        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
+    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+
+
+    # Compute head entropy and importance score
+    compute_heads_importance(args, model, eval_dataloader)
+
+
+    # Try head masking (set heads to zero until the score goes under a threshole)
+    # and head pruning (remove masked heads and see the effect on the network)
+    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
+        head_mask = mask_heads(args, model, eval_dataloader)
+        prune_heads(args, model, eval_dataloader, head_mask)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -1,541 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import sys
-import random
-from tqdm import tqdm, trange
-
-import numpy as np
-
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from tensorboardX import SummaryWriter
-
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--task_name",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The name of the task to train.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--cache_dir",
-                        default="",
-                        type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    args.device = device
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    task_name = args.task_name.lower()
-
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
-
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    global_step = 0
-    nb_tr_steps = 0
-    tr_loss = 0
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-
-        # Prepare data loader
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                # define a new function to compute loss values for both output_modes
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-                if output_mode == "classification":
-                    loss_fct = CrossEntropyLoss()
-                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-                elif output_mode == "regression":
-                    loss_fct = MSELoss()
-                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-
-    model.to(device)
-
-    ### Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = processor.get_dev_examples(args.data_dir)
-        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_eval_features_file, "rb") as reader:
-                eval_features = pickle.load(reader)
-        except:
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                with open(cached_eval_features_file, "wb") as writer:
-                    pickle.dump(eval_features, writer)
-
-
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        # Run prediction for full data
-        if args.local_rank == -1:
-            eval_sampler = SequentialSampler(eval_data)
-        else:
-            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss = 0
-        nb_eval_steps = 0
-        preds = []
-        out_label_ids = None
-
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-            # create eval loss and other metric required by the task
-            if output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-            elif output_mode == "regression":
-                loss_fct = MSELoss()
-                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
-
-            eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if len(preds) == 0:
-                preds.append(logits.detach().cpu().numpy())
-                out_label_ids = label_ids.detach().cpu().numpy()
-            else:
-                preds[0] = np.append(
-                    preds[0], logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(
-                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        preds = preds[0]
-        if output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif output_mode == "regression":
-            preds = np.squeeze(preds)
-        result = compute_metrics(task_name, preds, out_label_ids)
-
-        loss = tr_loss/global_step if args.do_train else None
-
-        result['eval_loss'] = eval_loss
-        result['global_step'] = global_step
-        result['loss'] = loss
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-        # hack for MNLI-MM
-        if task_name == "mnli":
-            task_name = "mnli-mm"
-            processor = processors[task_name]()
-
-            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(args.output_dir + '-MM'):
-                os.makedirs(args.output_dir + '-MM')
-
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Run prediction for full data
-            eval_sampler = SequentialSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = []
-            out_label_ids = None
-
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                label_ids = label_ids.to(device)
-
-                with torch.no_grad():
-                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
-
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if len(preds) == 0:
-                    preds.append(logits.detach().cpu().numpy())
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds[0] = np.append(
-                        preds[0], logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(
-                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            preds = preds[0]
-            preds = np.argmax(preds, axis=1)
-            result = compute_metrics(task_name, preds, out_label_ids)
-
-            loss = tr_loss/global_step if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    main()
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+from tqdm import trange
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+
+MODEL_CLASSES = {
+    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
+    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
+    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
+}
+
+# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+# in https://github.com/rusiaaman/XLNet-gen#methodology
+# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
+(except for Alexei and Maria) are discovered.
+The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+remainder of the story. 1883 Western Siberia,
+a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+Rasputin has a vision and denounces one of the men as a horse thief. Although his
+father initially slaps him for making such an accusation, Rasputin watches as the
+man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+
+def set_seed(args):
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+        Args:
+            logits: logits distribution shape (vocabulary size)
+            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[indices_to_remove] = filter_value
+    return logits
+
+
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+    context = torch.tensor(context, dtype=torch.long, device=device)
+    context = context.unsqueeze(0).repeat(num_samples, 1)
+    generated = context
+    with torch.no_grad():
+        for _ in trange(length):
+
+            inputs = {'input_ids': generated}
+            if is_xlnet: 
+                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
+                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
+                input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
+                perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
+                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+                target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
+                target_mapping[0, 0, -1] = 1.0  # predict last token
+                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
+
+            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
+            next_token_logits = outputs[0][0, -1, :] / temperature
+            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
+    return generated
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--prompt", type=str, default="")
+    parser.add_argument("--padding_text", type=str, default="")
+    parser.add_argument("--length", type=int, default=20)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top_k", type=int, default=0)
+    parser.add_argument("--top_p", type=float, default=0.9)
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    args = parser.parse_args()
+
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+
+    set_seed(args)
+
+    args.model_type = args.model_type.lower()
+    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path)
+    model.to(args.device)
+    model.eval()
+
+    if args.length < 0 and model.config.max_position_embeddings > 0:
+        args.length = model.config.max_position_embeddings
+    elif 0 < model.config.max_position_embeddings < args.length:
+        args.length = model.config.max_position_embeddings  # No generation bigger than model size 
+    elif args.length < 0:
+        args.length = MAX_LENGTH  # avoid infinite loop
+
+    print(args)
+    while True:
+        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
+        if args.model_type in ["transfo-xl", "xlnet"]:
+            # Models with memory likes to have a long prompt for short inputs.
+            raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
+        context_tokens = tokenizer.encode(raw_text)
+        out = sample_sequence(
+            model=model,
+            context=context_tokens,
+            length=args.length,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+            device=args.device,
+            is_xlnet=bool(args.model_type == "xlnet"),
+        )
+        out = out[0, len(context_tokens):].tolist()
+        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+        print(text)
+        if args.prompt:
+            break
+    return text
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -0,0 +1,475 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tensorboardX import SummaryWriter
+from tqdm import tqdm, trange
+
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForSequenceClassification,
+                                  XLNetTokenizer)
+
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            ouputs = model(**inputs)
+            loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
+
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+        # Note that DistributedSampler samples randomly
+        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        # Eval!
+        logger.info("***** Running evaluation {} *****".format(prefix))
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        eval_loss = 0.0
+        nb_eval_steps = 0
+        preds = None
+        out_label_ids = None
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            with torch.no_grad():
+                inputs = {'input_ids':      batch[0],
+                          'attention_mask': batch[1],
+                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                          'labels':         batch[3]}
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+                out_label_ids = inputs['labels'].detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif args.output_mode == "regression":
+            preds = np.squeeze(preds)
+        result = compute_metrics(eval_task, preds, out_label_ids)
+        results.update(result)
+
+        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results {} *****".format(prefix))
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            sep_token=tokenizer.sep_token,
+            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
+            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import logging
-from tqdm import trange
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-
-from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-def top_k_logits(logits, k):
-    """
-    Masks everything but the k top entries as -infinity (1e10).
-    Used to mask logits such that e^-infinity -> 0 won't contribute to the
-    sum of the denominator.
-    """
-    if k == 0:
-        return logits
-    else:
-        values = torch.topk(logits, k)[0]
-        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
-        return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)
-
-def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
-    if start_token is None:
-        assert context is not None, 'Specify exactly one of start_token and context!'
-        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
-    else:
-        assert context is None, 'Specify exactly one of start_token and context!'
-        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
-    prev = context
-    output = context
-    past = None
-    with torch.no_grad():
-        for i in trange(length):
-            logits, past = model(prev, past=past)
-            logits = logits[:, -1, :] / temperature
-            logits = top_k_logits(logits, k=top_k)
-            log_probs = F.softmax(logits, dim=-1)
-            if sample:
-                prev = torch.multinomial(log_probs, num_samples=1)
-            else:
-                _, prev = torch.topk(log_probs, k=1, dim=-1)
-            output = torch.cat((output, prev), dim=1)
-    return output
-
-def run_model():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--nsamples", type=int, default=1)
-    parser.add_argument("--batch_size", type=int, default=-1)
-    parser.add_argument("--length", type=int, default=-1)
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--top_k", type=int, default=0)
-    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
-    args = parser.parse_args()
-    print(args)
-
-    if args.batch_size == -1:
-        args.batch_size = 1
-    assert args.nsamples % args.batch_size == 0
-
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    torch.cuda.manual_seed(args.seed)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
-    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
-    model.to(device)
-    model.eval()
-
-    if args.length == -1:
-        args.length = model.config.n_ctx // 2
-    elif args.length > model.config.n_ctx:
-        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
-
-    while True:
-        context_tokens = []
-        if not args.unconditional:
-            raw_text = input("Model prompt >>> ")
-            while not raw_text:
-                print('Prompt should not be empty!')
-                raw_text = input("Model prompt >>> ")
-            context_tokens = enc.encode(raw_text)
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=context_tokens,
-                    start_token=None,
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:, len(context_tokens):].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-        else:
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=None,
-                    start_token=enc.encoder['<|endoftext|>'],
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:,1:].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-
-if __name__ == '__main__':
-    run_model()
-
-
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Run BERT on SQuAD."""
+""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet)."""

 from __future__ import absolute_import, division, print_function

@@ -21,8 +21,7 @@ import argparse
 import logging
 import os
 import random
-import sys
-from io import open
+import glob

 import numpy as np
 import torch
@@ -33,36 +32,306 @@ from tqdm import tqdm, trange

 from tensorboardX import SummaryWriter

-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer)

-from run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from pytorch_transformers import AdamW, WarmupLinearSchedule

-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
+from utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
+
+# The follwing import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
+from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

 logger = logging.getLogger(__name__)

+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+}
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+def to_list(tensor):
+    return tensor.detach().cpu().tolist()
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':       batch[0],
+                      'token_type_ids':  None if args.model_type == 'xlm' else batch[1],  # XLM don't use segment_ids
+                      'attention_mask':  batch[2],
+                      'start_positions': batch[3],
+                      'end_positions':   batch[4]}
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':    batch[6]})
+            ouputs = model(**inputs)
+            loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                      'token_type_ids': None if args.model_type == 'xlm' else batch[1],  # XLM don't use segment_ids
+                      'attention_mask': batch[2]}
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
+            outputs = model(**inputs)
+
+        for i, example_index in enumerate(example_indices):
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
+
+    # Evaluate with the official SQuAD script
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+    if evaluate:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_example_index, all_cls_index, all_p_mask)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_start_positions, all_end_positions,
+                                all_cls_index, all_p_mask)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+

 def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
@@ -71,65 +340,74 @@ def main():
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
+    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
-    print(args)

+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
@@ -137,263 +415,105 @@ def main():
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

+    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device

+    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError(
-                "If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError(
-                "If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory () already exists and is not empty.")
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
+    # Set seed
+    set_seed(args)

+    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-    if args.local_rank == 0:
-        torch.distributed.barrier()

-    if args.fp16:
-        model.half()
-    model.to(device)
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parrallel training
+    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
-        train_examples = read_squad_examples(
-            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
-        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=args.max_seq_length,
-                doc_stride=args.doc_stride,
-                max_query_length=args.max_query_length,
-                is_training=True)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                   all_start_positions, all_end_positions)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)

-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        # if args.local_rank != -1:
-        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+    # Save the trained model and the tokenizer
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)

-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num orig examples = %d", len(train_examples))
-        logger.info("  Num split examples = %d", len(train_features))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                if n_gpu == 1:
-                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
-                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used and handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)

-    model.to(device)

-    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_squad_examples(
-            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-        eval_features = convert_examples_to_features(
-            examples=eval_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=False)
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

-        logger.info("***** Running predictions *****")
-        logger.info("  Num orig examples = %d", len(eval_examples))
-        logger.info("  Num split examples = %d", len(eval_features))
-        logger.info("  Batch size = %d", args.predict_batch_size)
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)

-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)

-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results


 if __name__ == "__main__":
--- a/examples/single_model_scripts/run_openai_gpt.py
+++ b/examples/single_model_scripts/run_openai_gpt.py
@@ -39,8 +39,8 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

-from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
+from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME)

 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

@@ -191,7 +191,7 @@ def main():
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
-        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
--- a/examples/single_model_scripts/run_swag.py
+++ b/examples/single_model_scripts/run_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/single_model_scripts/run_transfo_xl.py
+++ b/examples/single_model_scripts/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math

 import torch

-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unittest
+import argparse
+import logging
+
+try:
+    # python 3.4+ can use builtin unittest.mock instead of mock package
+    from unittest.mock import patch
+except ImportError:
+    from mock import patch
+
+import run_glue
+import run_squad
+import run_generation
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+def get_setup_file():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f')
+    args = parser.parse_args()
+    return args.f
+
+class ExamplesTests(unittest.TestCase):
+
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_glue.py",
+                    "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc",
+                    "--do_train",
+                    "--do_eval",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--learning_rate=1e-4",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--overwrite_output_dir",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
+
+    def test_run_squad(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_squad.py",
+                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--model_name=bert-base-uncased",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--do_train",
+                    "--do_eval",
+                    "--version_2_with_negative",
+                    "--learning_rate=1e-4",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--overwrite_output_dir",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_squad.main()
+            self.assertGreaterEqual(result['f1'], 30)
+            self.assertGreaterEqual(result['exact'], 30)
+
+    def test_generation(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_generation.py",
+                    "--prompt=Hello",
+                    "--length=10",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=openai-gpt",
+                                  "--model_name_or_path=openai-gpt")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_generation.main()
+            self.assertGreaterEqual(len(result), 10)
+
+if __name__ == "__main__":
+    unittest.main()
--- a/examples/tests_samples/.gitignore
+++ b/examples/tests_samples/.gitignore
@@ -0,0 +1,6 @@
+*.*
+cache*
+temp*
+!*.tsv
+!*.json
+!.gitignore
--- a/examples/tests_samples/MRPC/dev.tsv
+++ b/examples/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/examples/tests_samples/MRPC/train.tsv
+++ b/examples/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/examples/tests_samples/SQUAD/dev-v2.0-small.json
+++ b/examples/tests_samples/SQUAD/dev-v2.0-small.json
@@ -0,0 +1,140 @@
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
--- a/examples/run_classifier_dataset_utils.py
+++ b/examples/run_classifier_dataset_utils.py
@@ -21,6 +21,7 @@ import csv
 import logging
 import os
 import sys
+from io import open

 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
@@ -77,7 +78,7 @@ class DataProcessor(object):
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
+        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
@@ -388,8 +389,18 @@ class WnliProcessor(DataProcessor):


 def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer, output_mode):
-    """Loads a data file into a list of `InputBatch`s."""
+                                 tokenizer, output_mode,
+                                 cls_token_at_end=False, pad_on_left=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=1, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
+    """ Loads a data file into a list of `InputBatch`s
+        `cls_token_at_end` define the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+    """

    label_map = {label : i for i, label in enumerate(label_list)}

@@ -415,10 +426,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
+        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
@@ -430,24 +441,36 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        segment_ids = [0] * len(tokens)
+        tokens = tokens_a + [sep_token]
+        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            segment_ids += [1] * (len(tokens_b) + 1)
+            tokens += tokens_b + [sep_token]
+            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
+
+        if cls_token_at_end:
+            tokens = tokens + [cls_token]
+            segment_ids = segment_ids + [cls_token_segment_id]
+        else:
+            tokens = [cls_token] + tokens
+            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        input_mask += padding
-        segment_ids += padding
+        padding_length = max_seq_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
@@ -467,8 +490,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
@@ -561,6 +583,7 @@ processors = {
 output_modes = {
    "cola": "classification",
    "mnli": "classification",
+    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
@@ -569,3 +592,15 @@ output_modes = {
    "rte": "classification",
    "wnli": "classification",
 }
+
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
--- a/examples/run_squad_dataset_utils.py
+++ b/examples/run_squad_dataset_utils.py
@@ -1,3 +1,4 @@
+
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
@@ -23,7 +24,10 @@ import math
 import collections
 from io import open

-from pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize
+from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+
+# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
+from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores

 logger = logging.getLogger(__name__)

@@ -81,6 +85,9 @@ class InputFeatures(object):
                 input_ids,
                 input_mask,
                 segment_ids,
+                 cls_index,
+                 p_mask,
+                 paragraph_len,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
@@ -93,6 +100,9 @@ class InputFeatures(object):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+        self.paragraph_len = paragraph_len
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
@@ -177,13 +187,25 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):


 def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training):
+                                 doc_stride, max_query_length, is_training,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=0, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
+    # cnt_pos, cnt_neg = 0, 0
+    # max_N, max_M = 1024, 1024
+    # f = np.zeros((max_N, max_M), dtype=np.float32)

    features = []
    for (example_index, example) in enumerate(examples):
+
+        # if example_index % 100 == 0:
+        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
+
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
@@ -238,14 +260,30 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
-            tokens.append("[CLS]")
-            segment_ids.append(0)
+
+            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+            # Original TF implem also keep the classification token (set to 0) (not sure why...)
+            p_mask = []
+
+            # CLS token at the beginning
+            if not cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = 0
+
+            # Query
            for token in query_tokens:
                tokens.append(token)
-                segment_ids.append(0)
-            tokens.append("[SEP]")
-            segment_ids.append(0)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)

+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_a_segment_id)
+            p_mask.append(1)
+
+            # Paragraph
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
@@ -254,29 +292,43 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+                segment_ids.append(sequence_b_segment_id)
+                p_mask.append(0)
+            paragraph_len = doc_span.length
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_b_segment_id)
+            p_mask.append(1)
+
+            # CLS token at the end
+            if cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = len(tokens) - 1  # Index of classification token

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
-            input_mask = [1] * len(input_ids)
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
-                input_ids.append(0)
-                input_mask.append(0)
-                segment_ids.append(0)
+                input_ids.append(pad_token)
+                input_mask.append(0 if mask_padding_with_zero else 1)
+                segment_ids.append(pad_token_segment_id)
+                p_mask.append(1)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

+            span_is_impossible = example.is_impossible
            start_position = None
            end_position = None
-            if is_training and not example.is_impossible:
+            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
@@ -288,13 +340,16 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                if out_of_span:
                    start_position = 0
                    end_position = 0
+                    span_is_impossible = True
                else:
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
-            if is_training and example.is_impossible:
-                start_position = 0
-                end_position = 0
+
+            if is_training and span_is_impossible:
+                start_position = cls_index
+                end_position = cls_index
+
            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
@@ -311,9 +366,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and example.is_impossible:
+                if is_training and span_is_impossible:
                    logger.info("impossible example")
-                if is_training and not example.is_impossible:
+                if is_training and not span_is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
@@ -331,9 +386,12 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
+                    cls_index=cls_index,
+                    p_mask=p_mask,
+                    paragraph_len=paragraph_len,
                    start_position=start_position,
                    end_position=end_position,
-                    is_impossible=example.is_impossible))
+                    is_impossible=span_is_impossible))
            unique_id += 1

    return features
@@ -416,7 +474,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

-
 def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, verbose_logging,
@@ -555,7 +612,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
            if len(nbest)==1:
                nbest.insert(0,
                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-                
+
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
@@ -608,6 +665,205 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

+    return all_predictions
+
+
+# For XLNet (and XLM which uses the same head)
+RawResultExtended = collections.namedtuple("RawResultExtended",
+    ["unique_id", "start_top_log_probs", "start_top_index",
+     "end_top_log_probs", "end_top_index", "cls_logits"])
+
+
+def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
+                                max_answer_length, output_prediction_file,
+                                output_nbest_file,
+                                output_null_log_odds_file, orig_data_file,
+                                start_n_top, end_n_top, version_2_with_negative,
+                                tokenizer, verbose_logging):
+    """ XLNet write prediction logic (more complex than Bert's).
+        Write final predictions to the json file and log-odds of null if needed.
+
+        Requires utils_squad_evaluate.py
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index",
+        "start_log_prob", "end_log_prob"])
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+    logger.info("Writing predictions to: %s", output_prediction_file)
+    # logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_top_log_probs[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob))
+
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_log_prob + x.end_log_prob),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            # 
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+                                        verbose_logging)
+
+            if final_text in seen_predictions:
+                continue
+
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_log_prob=pred.start_log_prob,
+                    end_log_prob=pred.end_log_prob))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="", start_log_prob=-1e6,
+                end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+        assert best_non_null_entry is not None
+
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    with open(orig_data_file, "r", encoding='utf-8') as reader:
+        orig_data = json.load(reader)["data"]
+
+    qid_to_has_ans = make_qid_to_has_ans(orig_data)
+    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
+    out_eval = {}
+
+    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
+
+    return out_eval
+

 def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
--- a/examples/utils_squad_evaluate.py
+++ b/examples/utils_squad_evaluate.py
@@ -0,0 +1,330 @@
+""" Official evaluation script for SQuAD version 2.0.
+    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question ID's to the model's predicted probability
+that a question is unanswerable.
+"""
+import argparse
+import collections
+import json
+import numpy as np
+import os
+import re
+import string
+import sys
+
+class EVAL_OPTS():
+  def __init__(self, data_file, pred_file, out_file="",
+               na_prob_file="na_prob.json", na_prob_thresh=1.0,
+               out_image_dir=None, verbose=False):
+    self.data_file = data_file
+    self.pred_file = pred_file
+    self.out_file = out_file
+    self.na_prob_file = na_prob_file
+    self.na_prob_thresh = na_prob_thresh
+    self.out_image_dir = out_image_dir
+    self.verbose = verbose
+
+OPTS = None
+
+def parse_args():
+  parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
+  parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
+  parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
+  parser.add_argument('--out-file', '-o', metavar='eval.json',
+                      help='Write accuracy metrics to file (default is stdout).')
+  parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
+                      help='Model estimates of probability of no answer.')
+  parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
+                      help='Predict "" if no-answer probability exceeds this (default = 1.0).')
+  parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
+                      help='Save precision-recall curves to directory.')
+  parser.add_argument('--verbose', '-v', action='store_true')
+  if len(sys.argv) == 1:
+    parser.print_help()
+    sys.exit(1)
+  return parser.parse_args()
+
+def make_qid_to_has_ans(dataset):
+  qid_to_has_ans = {}
+  for article in dataset:
+    for p in article['paragraphs']:
+      for qa in p['qas']:
+        qid_to_has_ans[qa['id']] = bool(qa['answers'])
+  return qid_to_has_ans
+
+def normalize_answer(s):
+  """Lower text and remove punctuation, articles and extra whitespace."""
+  def remove_articles(text):
+    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+    return re.sub(regex, ' ', text)
+  def white_space_fix(text):
+    return ' '.join(text.split())
+  def remove_punc(text):
+    exclude = set(string.punctuation)
+    return ''.join(ch for ch in text if ch not in exclude)
+  def lower(text):
+    return text.lower()
+  return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+def get_tokens(s):
+  if not s: return []
+  return normalize_answer(s).split()
+
+def compute_exact(a_gold, a_pred):
+  return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+def compute_f1(a_gold, a_pred):
+  gold_toks = get_tokens(a_gold)
+  pred_toks = get_tokens(a_pred)
+  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+  num_same = sum(common.values())
+  if len(gold_toks) == 0 or len(pred_toks) == 0:
+    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+    return int(gold_toks == pred_toks)
+  if num_same == 0:
+    return 0
+  precision = 1.0 * num_same / len(pred_toks)
+  recall = 1.0 * num_same / len(gold_toks)
+  f1 = (2 * precision * recall) / (precision + recall)
+  return f1
+
+def get_raw_scores(dataset, preds):
+  exact_scores = {}
+  f1_scores = {}
+  for article in dataset:
+    for p in article['paragraphs']:
+      for qa in p['qas']:
+        qid = qa['id']
+        gold_answers = [a['text'] for a in qa['answers']
+                        if normalize_answer(a['text'])]
+        if not gold_answers:
+          # For unanswerable questions, only correct answer is empty string
+          gold_answers = ['']
+        if qid not in preds:
+          print('Missing prediction for %s' % qid)
+          continue
+        a_pred = preds[qid]
+        # Take max over all gold answers
+        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
+        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
+  return exact_scores, f1_scores
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+  new_scores = {}
+  for qid, s in scores.items():
+    pred_na = na_probs[qid] > na_prob_thresh
+    if pred_na:
+      new_scores[qid] = float(not qid_to_has_ans[qid])
+    else:
+      new_scores[qid] = s
+  return new_scores
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+  if not qid_list:
+    total = len(exact_scores)
+    return collections.OrderedDict([
+        ('exact', 100.0 * sum(exact_scores.values()) / total),
+        ('f1', 100.0 * sum(f1_scores.values()) / total),
+        ('total', total),
+    ])
+  else:
+    total = len(qid_list)
+    return collections.OrderedDict([
+        ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+        ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+        ('total', total),
+    ])
+
+def merge_eval(main_eval, new_eval, prefix):
+  for k in new_eval:
+    main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+def plot_pr_curve(precisions, recalls, out_image, title):
+  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
+  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
+  plt.xlabel('Recall')
+  plt.ylabel('Precision')
+  plt.xlim([0.0, 1.05])
+  plt.ylim([0.0, 1.05])
+  plt.title(title)
+  plt.savefig(out_image)
+  plt.clf()
+
+def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
+                               out_image=None, title=None):
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  true_pos = 0.0
+  cur_p = 1.0
+  cur_r = 0.0
+  precisions = [1.0]
+  recalls = [0.0]
+  avg_prec = 0.0
+  for i, qid in enumerate(qid_list):
+    if qid_to_has_ans[qid]:
+      true_pos += scores[qid]
+    cur_p = true_pos / float(i+1)
+    cur_r = true_pos / float(num_true_pos)
+    if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
+      # i.e., if we can put a threshold after this point
+      avg_prec += cur_p * (cur_r - recalls[-1])
+      precisions.append(cur_p)
+      recalls.append(cur_r)
+  if out_image:
+    plot_pr_curve(precisions, recalls, out_image, title)
+  return {'ap': 100.0 * avg_prec}
+
+def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 
+                                  qid_to_has_ans, out_image_dir):
+  if out_image_dir and not os.path.exists(out_image_dir):
+    os.makedirs(out_image_dir)
+  num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
+  if num_true_pos == 0:
+    return
+  pr_exact = make_precision_recall_eval(
+      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_exact.png'),
+      title='Precision-Recall curve for Exact Match score')
+  pr_f1 = make_precision_recall_eval(
+      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_f1.png'),
+      title='Precision-Recall curve for F1 score')
+  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
+  pr_oracle = make_precision_recall_eval(
+      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
+      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
+  merge_eval(main_eval, pr_exact, 'pr_exact')
+  merge_eval(main_eval, pr_f1, 'pr_f1')
+  merge_eval(main_eval, pr_oracle, 'pr_oracle')
+
+def histogram_na_prob(na_probs, qid_list, image_dir, name):
+  if not qid_list:
+    return
+  x = [na_probs[k] for k in qid_list]
+  weights = np.ones_like(x) / float(len(x))
+  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
+  plt.xlabel('Model probability of no-answer')
+  plt.ylabel('Proportion of dataset')
+  plt.title('Histogram of no-answer probability: %s' % name)
+  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
+  plt.clf()
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+  cur_score = num_no_ans
+  best_score = cur_score
+  best_thresh = 0.0
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  for i, qid in enumerate(qid_list):
+    if qid not in scores: continue
+    if qid_to_has_ans[qid]:
+      diff = scores[qid]
+    else:
+      if preds[qid]:
+        diff = -1
+      else:
+        diff = 0
+    cur_score += diff
+    if cur_score > best_score:
+      best_score = cur_score
+      best_thresh = na_probs[qid]
+  return 100.0 * best_score / len(scores), best_thresh
+
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+  cur_score = num_no_ans
+  best_score = cur_score
+  best_thresh = 0.0
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  for i, qid in enumerate(qid_list):
+    if qid not in scores: continue
+    if qid_to_has_ans[qid]:
+      diff = scores[qid]
+    else:
+      if preds[qid]:
+        diff = -1
+      else:
+        diff = 0
+    cur_score += diff
+    if cur_score > best_score:
+      best_score = cur_score
+      best_thresh = na_probs[qid]
+
+  has_ans_score, has_ans_cnt = 0, 0
+  for qid in qid_list:
+    if not qid_to_has_ans[qid]: continue
+    has_ans_cnt += 1
+
+    if qid not in scores: continue
+    has_ans_score += scores[qid]
+
+  return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+  best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+  main_eval['best_exact'] = best_exact
+  main_eval['best_exact_thresh'] = exact_thresh
+  main_eval['best_f1'] = best_f1
+  main_eval['best_f1_thresh'] = f1_thresh
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+  best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+  main_eval['best_exact'] = best_exact
+  main_eval['best_exact_thresh'] = exact_thresh
+  main_eval['best_f1'] = best_f1
+  main_eval['best_f1_thresh'] = f1_thresh
+  main_eval['has_ans_exact'] = has_ans_exact
+  main_eval['has_ans_f1'] = has_ans_f1
+
+def main(OPTS):
+  with open(OPTS.data_file) as f:
+    dataset_json = json.load(f)
+    dataset = dataset_json['data']
+  with open(OPTS.pred_file) as f:
+    preds = json.load(f)
+  if OPTS.na_prob_file:
+    with open(OPTS.na_prob_file) as f:
+      na_probs = json.load(f)
+  else:
+    na_probs = {k: 0.0 for k in preds}
+  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
+  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+  exact_raw, f1_raw = get_raw_scores(dataset, preds)
+  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
+                                        OPTS.na_prob_thresh)
+  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
+                                     OPTS.na_prob_thresh)
+  out_eval = make_eval_dict(exact_thresh, f1_thresh)
+  if has_ans_qids:
+    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
+    merge_eval(out_eval, has_ans_eval, 'HasAns')
+  if no_ans_qids:
+    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
+    merge_eval(out_eval, no_ans_eval, 'NoAns')
+  if OPTS.na_prob_file:
+    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+  if OPTS.na_prob_file and OPTS.out_image_dir:
+    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 
+                                  qid_to_has_ans, OPTS.out_image_dir)
+    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
+    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
+  if OPTS.out_file:
+    with open(OPTS.out_file, 'w') as f:
+      json.dump(out_eval, f)
+  else:
+    print(json.dumps(out_eval, indent=2))
+  return out_eval
+
+if __name__ == '__main__':
+  OPTS = parse_args()
+  if OPTS.out_image_dir:
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt 
+  main(OPTS)