Merge pull request #254 from huggingface/python_2

Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
2019-02-11 14:19:26 +01:00
parent 2dfaf2f227 1e71f11dec
commit 03cdb2a390
30 changed files with 5729 additions and 299 deletions
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
+        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -15,26 +15,26 @@
 # limitations under the License.
 """BERT finetuning runner."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function

-import csv
-import os
-import logging
 import argparse
+import csv
+import logging
+import os
 import random
-from tqdm import tqdm, trange
+import sys

 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange

-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
@@ -91,10 +91,12 @@ class DataProcessor(object):
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding='utf-8') as f:
+        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines

@@ -321,6 +323,10 @@ def main():
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
+    parser.add_argument("--cache_dir",
+                        default="",
+                        type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
@@ -380,9 +386,17 @@ def main():
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
-
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
@@ -424,7 +438,8 @@ def main():

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

@@ -447,8 +462,9 @@ def main():
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
+    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
-              cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+              cache_dir=cache_dir,
              num_labels = num_labels)
    if args.fp16:
        model.half()
@@ -545,15 +561,21 @@ def main():
                    optimizer.zero_grad()
                    global_step += 1

-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())

-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForSequenceClassification(config, num_labels=num_labels)
+        model.load_state_dict(torch.load(output_model_file))
+    else:
+        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -15,22 +15,22 @@
 # limitations under the License.
 """BERT finetuning runner."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

-import os
-import logging
 import argparse
-from tqdm import tqdm, trange
+import logging
+import os
+import random
+from io import open

 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange

-from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForPreTraining
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

 from torch.utils.data import Dataset
@@ -179,16 +179,16 @@ class BERTDataset(Dataset):
            if self.line_buffer is None:
                # read first non-empty line of file
                while t1 == "" :
-                    t1 = self.file.__next__().strip()
-                    t2 = self.file.__next__().strip()
+                    t1 = next(self.file).strip()
+                    t2 = next(self.file).strip()
            else:
                # use t2 from previous iteration as new t1
                t1 = self.line_buffer
-                t2 = self.file.__next__().strip()
+                t2 = next(self.file).strip()
                # skip empty rows that are used for separating documents and keep track of current doc id
                while t2 == "" or t1 == "":
-                    t1 = self.file.__next__().strip()
-                    t2 = self.file.__next__().strip()
+                    t1 = next(self.file).strip()
+                    t2 = next(self.file).strip()
                    self.current_doc = self.current_doc+1
            self.line_buffer = t2

@@ -222,15 +222,15 @@ class BERTDataset(Dataset):
    def get_next_line(self):
        """ Gets next line of random_file and starts over when reaching end of file"""
        try:
-            line = self.random_file.__next__().strip()
+            line = next(self.random_file).strip()
            #keep track of which document we are currently looking at to later avoid having the same doc as t1
            if line == "":
                self.current_random_doc = self.current_random_doc + 1
-                line = self.random_file.__next__().strip()
+                line = next(self.random_file).strip()
        except StopIteration:
            self.random_file.close()
            self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
-            line = self.random_file.__next__().strip()
+            line = next(self.random_file).strip()
        return line


@@ -419,6 +419,7 @@ def main():
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
+    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
@@ -506,7 +507,8 @@ def main():

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

@@ -575,7 +577,7 @@ def main():
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
-            #TODO: check if this works with current data generator from disk that relies on file.__next__
+            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
@@ -641,4 +643,4 @@ def accuracy(out, labels):


 if __name__ == "__main__":
-    main()
+    main()
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -0,0 +1,259 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT model fine-tuning script.
+    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
+    It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
+
+    This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset
+"""
+import argparse
+import os
+import csv
+import random
+import logging
+from tqdm import tqdm, trange
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+
+from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
+
+ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def accuracy(out, labels):
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels)
+
+def load_rocstories_dataset(dataset_path):
+    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
+    with open(dataset_path, encoding='utf_8') as f:
+        f = csv.reader(f)
+        output = []
+        next(f) # skip the first line
+        for line in tqdm(f):
+            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+    return output
+
+def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
+    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+
+        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+    """
+    tensor_datasets = []
+    for dataset in encoded_datasets:
+        n_batch = len(dataset)
+        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
+        mc_labels = np.zeros((n_batch,), dtype=np.int64)
+        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
+            input_ids[i, 0, :len(with_cont1)] = with_cont1
+            input_ids[i, 1, :len(with_cont2)] = with_cont2
+            mc_token_ids[i, 0] = len(with_cont1) - 1
+            mc_token_ids[i, 1] = len(with_cont2) - 1
+            lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
+            lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
+            mc_labels[i] = mc_label
+        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
+        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
+    return tensor_datasets
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name', type=str, default='openai-gpt',
+                        help='pretrained model name')
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument('--train_dataset', type=str, default='')
+    parser.add_argument('--eval_dataset', type=str, default='')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--num_train_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--eval_batch_size', type=int, default=16)
+    parser.add_argument('--max_grad_norm', type=int, default=1)
+    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
+    parser.add_argument('--warmup_proportion', type=float, default=0.002)
+    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
+    parser.add_argument('--weight_decay', type=float, default=0.01)
+    parser.add_argument('--lm_coef', type=float, default=0.9)
+    parser.add_argument('--n_valid', type=int, default=374)
+
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    print(args)
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info("device: {}, n_gpu {}".format(device, n_gpu))
+
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    # Load tokenizer and model
+    # This loading functions also add new tokens and embeddings called `special tokens`
+    # These new embeddings will be fine-tuned on the RocStories dataset
+    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
+    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+    model.to(device)
+
+    # Load and encode the datasets
+    if not args.train_dataset and not args.eval_dataset:
+        roc_stories = cached_path(ROCSTORIES_URL)
+    def tokenize_and_encode(obj):
+        """ Tokenize and encode a nested object """
+        if isinstance(obj, str):
+            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+        elif isinstance(obj, int):
+            return obj
+        return list(tokenize_and_encode(o) for o in obj)
+    logger.info("Encoding dataset...")
+    train_dataset = load_rocstories_dataset(args.train_dataset)
+    eval_dataset = load_rocstories_dataset(args.eval_dataset)
+    datasets = (train_dataset, eval_dataset)
+    encoded_datasets = tokenize_and_encode(datasets)
+
+    # Compute the mex input length for the Transformer
+    max_length = model.config.n_positions // 2 - 2
+    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
+                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
+
+    # Prepare inputs tensors and dataloaders
+    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
+    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
+
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_data = TensorDataset(*eval_tensor_dataset)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+    optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                           lr=args.learning_rate,
+                           warmup=args.warmup_proportion,
+                           max_grad_norm=args.max_grad_norm,
+                           weight_decay=args.weight_decay,
+                           t_total=num_train_optimization_steps)
+
+    if args.do_train:
+        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0
+            nb_tr_steps = 0
+            tqdm_bar = tqdm(train_dataloader, desc="Training")
+            for step, batch in enumerate(tqdm_bar):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, mc_token_ids, lm_labels, mc_labels = batch
+                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
+                loss = args.lm_coef * losses[0] + losses[1]
+                loss.backward()
+                optimizer.step()
+                tr_loss += loss.item()
+                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
+                nb_tr_steps += 1
+                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
+
+    # Save a trained model
+    if args.do_train:
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
+        config = model.config
+        torch.save(model_to_save.state_dict(), output_model_file)
+
+        # Load a trained model that you have fine-tuned
+        model_state_dict = torch.load(output_model_file)
+        model = OpenAIGPTDoubleHeadsModel(config)
+        model.load_state_dict(model_state_dict)
+        model.to(device)
+
+    if args.do_eval:
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            batch = tuple(t.to(device) for t in batch)
+            input_ids, mc_token_ids, lm_labels, mc_labels = batch
+            with torch.no_grad():
+                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
+                _, mc_logits = model(input_ids, mc_token_ids)
+
+            mc_logits = mc_logits.detach().cpu().numpy()
+            mc_labels = mc_labels.to('cpu').numpy()
+            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
+
+            eval_loss += mc_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+        train_loss = tr_loss/nb_tr_steps if args.do_train else None
+        result = {'eval_loss': eval_loss,
+                  'eval_accuracy': eval_accuracy,
+                  'train_loss': train_loss}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+if __name__ == '__main__':
+    main()
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -15,29 +15,36 @@
 # limitations under the License.
 """Run BERT on SQuAD."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function

 import argparse
 import collections
-import logging
 import json
+import logging
 import math
 import os
 import random
-import pickle
-from tqdm import tqdm, trange
+import sys
+from io import open

 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange

-from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  whitespace_tokenize)
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
@@ -863,7 +870,8 @@ def main():

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory () already exists and is not empty.")
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

@@ -879,7 +887,7 @@ def main():

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(args.bert_model,
-                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
+                cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))

    if args.fp16:
        model.half()
@@ -909,7 +917,7 @@ def main():

    if args.fp16:
        try:
-            from apex.optimizer import FP16_Optimizer
+            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
@@ -993,14 +1001,19 @@ def main():
                    optimizer.zero_grad()
                    global_step += 1

-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
-        # Load a trained model that you have fine-tuned
-        model_state_dict = torch.load(output_model_file)
-        model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())
+
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForQuestionAnswering(config)
+        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)

--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -15,22 +15,25 @@
 # limitations under the License.
 """BERT finetuning runner."""

+import argparse
+import csv
 import logging
 import os
-import argparse
 import random
-from tqdm import tqdm, trange
-import csv
+import sys
+from io import open

 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange

-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForMultipleChoice
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.tokenization import BertTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
@@ -65,17 +68,17 @@ class SwagExample(object):

    def __repr__(self):
        l = [
-            f"swag_id: {self.swag_id}",
-            f"context_sentence: {self.context_sentence}",
-            f"start_ending: {self.start_ending}",
-            f"ending_0: {self.endings[0]}",
-            f"ending_1: {self.endings[1]}",
-            f"ending_2: {self.endings[2]}",
-            f"ending_3: {self.endings[3]}",
+            "swag_id: {}".format(self.swag_id),
+            "context_sentence: {}".format(self.context_sentence),
+            "start_ending: {}".format(self.start_ending),
+            "ending_0: {}".format(self.endings[0]),
+            "ending_1: {}".format(self.endings[1]),
+            "ending_2: {}".format(self.endings[2]),
+            "ending_3: {}".format(self.endings[3]),
        ]

        if self.label is not None:
-            l.append(f"label: {self.label}")
+            l.append("label: {}".format(self.label))

        return ", ".join(l)

@@ -102,7 +105,11 @@ class InputFeatures(object):
 def read_swag_examples(input_file, is_training):
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
-        lines = list(reader)
+        lines = []
+        for line in reader:
+            if sys.version_info[0] == 2:
+                line = list(unicode(cell, 'utf-8') for cell in line)
+            lines.append(line)

    if is_training and lines[0][-1] != 'label':
        raise ValueError(
@@ -184,15 +191,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
        label = example.label
        if example_index < 5:
            logger.info("*** Example ***")
-            logger.info(f"swag_id: {example.swag_id}")
+            logger.info("swag_id: {}".format(example.swag_id))
            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
-                logger.info(f"choice: {choice_idx}")
-                logger.info(f"tokens: {' '.join(tokens)}")
-                logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
-                logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
-                logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
+                logger.info("choice: {}".format(choice_idx))
+                logger.info("tokens: {}".format(' '.join(tokens)))
+                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
            if is_training:
-                logger.info(f"label: {label}")
+                logger.info("label: {}".format(label))

        features.append(
            InputFeatures(
@@ -344,7 +351,8 @@ def main():

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

@@ -359,7 +367,7 @@ def main():

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
        num_choices=4)
    if args.fp16:
        model.half()
@@ -461,18 +469,25 @@ def main():
                    optimizer.zero_grad()
                    global_step += 1

-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
-    torch.save(model_to_save.state_dict(), output_model_file)

-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        state_dict=model_state_dict,
-        num_choices=4)
+    if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        torch.save(model_to_save.state_dict(), output_model_file)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())
+
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForMultipleChoice(config, num_choices=4)
+        model.load_state_dict(torch.load(output_model_file))
+    else:
+        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
    model.to(device)

+
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
        eval_features = convert_examples_to_features(
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model evaluation script.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+
+    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+import time
+import math
+
+import torch
+
+from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main():
+    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
+    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
+                        help='pretrained model name')
+    parser.add_argument('--split', type=str, default='test',
+                        choices=['all', 'valid', 'test'],
+                        help='which split to evaluate')
+    parser.add_argument('--batch_size', type=int, default=10,
+                        help='batch size')
+    parser.add_argument('--tgt_len', type=int, default=128,
+                        help='number of tokens to predict')
+    parser.add_argument('--ext_len', type=int, default=0,
+                        help='length of the extended context')
+    parser.add_argument('--mem_len', type=int, default=1600,
+                        help='length of the retained previous heads')
+    parser.add_argument('--clamp_len', type=int, default=1000,
+                        help='max positional embedding index')
+    parser.add_argument('--no_cuda', action='store_true',
+                        help='Do not use CUDA even though CUA is available')
+    parser.add_argument('--work_dir', type=str, required=True,
+                        help='path to the work_dir')
+    parser.add_argument('--no_log', action='store_true',
+                        help='do not log the eval result')
+    parser.add_argument('--same_length', action='store_true',
+                        help='set same length attention with masking')
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    assert args.ext_len >= 0, 'extended context length must be non-negative'
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    logger.info("device: {}".format(device))
+
+    # Load a pre-processed dataset
+    # You can also build the corpus yourself using TransfoXLCorpus methods
+    # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
+    # and tokenizing the dataset
+    # The pre-processed corpus is a convertion (using the conversion script )
+    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
+    ntokens = len(corpus.vocab)
+
+    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
+        device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
+        device=device, ext_len=args.ext_len)
+
+    # Load a pre-trained model
+    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
+    model = model.to(device)
+
+    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
+        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+
+    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
+    if args.clamp_len > 0:
+        model.clamp_len = args.clamp_len
+    if args.same_length:
+        model.same_length = True
+
+    ###############################################################################
+    # Evaluation code
+    ###############################################################################
+    def evaluate(eval_iter):
+        # Turn on evaluation mode which disables dropout.
+        model.eval()
+        total_len, total_loss = 0, 0.
+        start_time = time.time()
+        with torch.no_grad():
+            mems = None
+            for idx, (data, target, seq_len) in enumerate(eval_iter):
+                ret = model(data, target, mems)
+                loss, mems = ret
+                loss = loss.mean()
+                total_loss += seq_len * loss.item()
+                total_len += seq_len
+            total_time = time.time() - start_time
+        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
+                total_time, 1000 * total_time / (idx+1)))
+        return total_loss / total_len
+
+    # Run on test data.
+    if args.split == 'all':
+        test_loss = evaluate(te_iter)
+        valid_loss = evaluate(va_iter)
+    elif args.split == 'valid':
+        valid_loss = evaluate(va_iter)
+        test_loss = None
+    elif args.split == 'test':
+        test_loss = evaluate(te_iter)
+        valid_loss = None
+
+    def format_log(loss, split):
+        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
+            split, loss, math.exp(loss))
+        return log_str
+
+    log_str = ''
+    if valid_loss is not None:
+        log_str += format_log(valid_loss, 'valid')
+    if test_loss is not None:
+        log_str += format_log(test_loss, 'test')
+
+    logger.info('=' * 100)
+    logger.info(log_str)
+    logger.info('=' * 100)
+
+if __name__ == '__main__':
+    main()