From de713fa9b4a04de3bdd0e921255b0819be8920a5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 20 Jun 2019 10:54:19 +0200
Subject: [PATCH 001/139] starting

---
 .../convert_xlnet_checkpoint_to_pytorch.py    |  62 ++
 pytorch_pretrained_bert/modeling.py           |   2 +-
 .../modeling_transfo_xl.py                    |   2 +-
 pytorch_pretrained_bert/modeling_xlnet.py     | 565 ++++++++++++++++++
 pytorch_pretrained_bert/tokenization_xlnet.py |   0
 5 files changed, 629 insertions(+), 2 deletions(-)
 create mode 100755 pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
 create mode 100644 pytorch_pretrained_bert/modeling_xlnet.py
 create mode 100644 pytorch_pretrained_bert/tokenization_xlnet.py

diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..b9220d60cd
--- /dev/null
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -0,0 +1,62 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert BERT checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from pytorch_pretrained_bert.modeling_xlnet import XLNetConfig, XLNetRunConfig, XLNetModel, load_tf_weights_in_xlnet
+
+def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = XLNetConfig.from_json_file(bert_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = XLNetModel(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_xlnet(model, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path the TensorFlow checkpoint path.")
+    parser.add_argument("--xlnet_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained XLNet model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.xlnet_config_file,
+                                     args.pytorch_dump_path)
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index c0ab2dd681..81825f2f35 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -718,7 +718,7 @@ class BertPreTrainedModel(nn.Module):
             state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint
-            return load_tf_weights_in_bert(model, weights_path)
+            return load_tf_weights_in_bert(model, resolved_archive_file)
         # Load from a PyTorch state_dict
         old_keys = []
         new_keys = []
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 534a111c77..ee04eda496 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -236,7 +236,7 @@ class TransfoXLConfig(object):
             dropout: The dropout probabilitiy for all fully connected
                 layers in the embeddings, encoder, and pooler.
             dropatt: The dropout ratio for the attention probabilities.
-            untie_r: untie relative position biases           
+            untie_r: untie relative position biases
             embd_pdrop: The dropout ratio for the embeddings.
             init: parameter initializer to use
             init_range: parameters initialized by U(-init_range, init_range).
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
new file mode 100644
index 0000000000..7d9bb6af5e
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -0,0 +1,565 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XLNet model.
+"""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import copy
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
+}
+PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
+}
+XLNET_CONFIG_NAME = 'xlnet_config.json'
+TF_WEIGHTS_NAME = 'model.ckpt'
+
+def load_tf_weights_in_xlnet(model, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model
+    """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
+            print("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    print("Skipping {}".format("/".join(name)))
+                    continue
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def gelu(x):
+    """Implementation of the gelu activation function.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+
+class XLNetBaseConfig(object):
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `XLNetConfig` from a Python dictionary of parameters."""
+        config = XLNetConfig(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `XLNetConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
+
+class XLNetConfig(XLNetBaseConfig):
+    """Configuration class to store the configuration of a `XLNetModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 d_model=768,
+                 n_layer=12,
+                 n_head=12,
+                 d_inner=3072,
+                 ff_activation="gelu",
+                 untie_r=True,
+
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12):
+        """Constructs XLNetConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLNetModel`.
+            d_model: Size of the encoder layers and the pooler layer.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            d_inner: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            ff_activation: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            untie_r: untie relative position biases
+
+            dropout: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            dropatt: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `XLNetModel`.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
+        """
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.d_model = d_model
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.ff_activation = ff_activation
+            self.d_inner = d_inner
+            self.untie_r = untie_r
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+
+class XLNetRunConfig(XLNetBaseConfig):
+    """XLNetRunConfig contains hyperparameters that could be different
+    between pretraining and finetuning.
+    These hyperparameters can also be changed from run to run.
+    We store them separately from XLNetConfig for flexibility.
+    """
+    def __init__(self, 
+                 dropout,
+                 dropatt,
+                 init="normal",
+                 init_range=0.1,
+                 init_std=0.02,
+                 mem_len=None,
+                 reuse_len=None,
+                 bi_data=False,
+                 clamp_len=-1,
+                 same_length=False):
+        """
+        Args:
+        dropout: float, dropout rate.
+        dropatt: float, dropout rate on attention probabilities.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+        """
+
+        self.init = init
+        self.init_range = init_range
+        self.init_std = init_std
+        self.dropout = dropout
+        self.dropatt = dropatt
+        self.mem_len = mem_len
+        self.reuse_len = reuse_len
+        self.bi_data = bi_data
+        self.clamp_len = clamp_len
+        self.same_length = same_length
+
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
+except ImportError:
+    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
+    class XLNetLayerNorm(nn.Module):
+        def __init__(self, hidden_size, eps=1e-12):
+            """Construct a layernorm module in the TF style (epsilon inside the square root).
+            """
+            super(XLNetLayerNorm, self).__init__()
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
+            self.variance_epsilon = eps
+
+        def forward(self, x):
+            u = x.mean(-1, keepdim=True)
+            s = (x - u).pow(2).mean(-1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+            return self.weight * x + self.bias
+
+class XLNetPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(XLNetPreTrainedModel, self).__init__()
+        if not isinstance(config, XLNetConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `XLNetConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_xlnet_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, XLNetLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """
+        Instantiate a XLNetPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `xlnet-large-cased`
+                - a path or url to a pretrained model archive containing:
+                    . `xlnet_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
+                - a path or url to a pretrained model archive containing:
+                    . `xlnet_config.json` a configuration file for the model
+                    . `model.chkpt` a TensorFlow checkpoint
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific XLNet class
+                (ex: num_labels for XLNetForSequenceClassification)
+        """
+        state_dict = kwargs.get('state_dict', None)
+        kwargs.pop('state_dict', None)
+        cache_dir = kwargs.get('cache_dir', None)
+        kwargs.pop('cache_dir', None)
+        from_tf = kwargs.get('from_tf', False)
+        kwargs.pop('from_tf', None)
+
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
+            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
+                config_file = os.path.join(pretrained_model_name_or_path, XLNET_CONFIG_NAME)
+            else:
+                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained weights.".format(
+                        archive_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                        archive_file))
+            return None
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        config_file))
+            return None
+        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+            logger.info("loading weights file {}".format(archive_file))
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+        # Load config
+        config = XLNetConfig.from_json_file(resolved_config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None and not from_tf:
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            return load_tf_weights_in_xlnet(model, resolved_archive_file)
+        # Load from a PyTorch state_dict
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        start_prefix = ''
+        if not hasattr(model, 'xlnet') and any(s.startswith('xlnet.') for s in state_dict.keys()):
+            start_prefix = 'xlnet.'
+        load(model, prefix=start_prefix)
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        return model
+
+
+class XLNetModel(XLNetPreTrainedModel):
+    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+
+    Params:
+        `config`: a XLNetConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see XLNet paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.XLNetModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetModel, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.apply(self.init_xlnet_weights)
+
+    def prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_multihead_outputs(self):
+        """ Gather all multi-head outputs.
+            Return: list (layers) of multihead module outputs with gradients
+        """
+        return [layer.attention.self.multihead_output for layer in self.encoder.layer]
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, token_type_ids)
+        encoded_layers = self.encoder(embedding_output,
+                                      extended_attention_mask,
+                                      output_all_encoded_layers=output_all_encoded_layers,
+                                      head_mask=head_mask)
+        if self.output_attentions:
+            all_attentions, encoded_layers = encoded_layers
+        sequence_output = encoded_layers[-1]
+        pooled_output = self.pooler(sequence_output)
+        if not output_all_encoded_layers:
+            encoded_layers = encoded_layers[-1]
+        if self.output_attentions:
+            return all_attentions, encoded_layers, pooled_output
+        return encoded_layers, pooled_output
+    
\ No newline at end of file
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
new file mode 100644
index 0000000000..e69de29bb2

From c2ea5aef772f6f3a06860f3f20d7366f1dcb576f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 20 Jun 2019 13:52:21 +0200
Subject: [PATCH 002/139] work in progress on xlnet

---
 pytorch_pretrained_bert/modeling_xlnet.py | 594 +++++++++++++++++++---
 1 file changed, 527 insertions(+), 67 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 7d9bb6af5e..1ae3167713 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -126,6 +126,16 @@ def swish(x):
 
 ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
+def positional_embedding(pos_seq, inv_freq, bsz=None):
+    sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
+    pos_emb = torch.cat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
+    pos_emb = pos_emb[:, None, :]
+
+    if bsz is not None:
+        pos_emb = pos_emb.expand(1, bsz, 1)
+
+    return pos_emb
+
 class XLNetBaseConfig(object):
     @classmethod
     def from_dict(cls, json_object):
@@ -165,15 +175,14 @@ class XLNetConfig(XLNetBaseConfig):
     """
     def __init__(self,
                  vocab_size_or_config_json_file,
-                 d_model=768,
-                 n_layer=12,
-                 n_head=12,
-                 d_inner=3072,
+                 d_model=1024,
+                 n_layer=24,
+                 n_head=16,
+                 d_inner=4096,
                  ff_activation="gelu",
                  untie_r=True,
 
                  max_position_embeddings=512,
-                 type_vocab_size=2,
                  initializer_range=0.02,
                  layer_norm_eps=1e-12):
         """Constructs XLNetConfig.
@@ -197,8 +206,6 @@ class XLNetConfig(XLNetBaseConfig):
             max_position_embeddings: The maximum sequence length that this model might
                 ever be used with. Typically set this to something large just in case
                 (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `XLNetModel`.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
@@ -214,11 +221,12 @@ class XLNetConfig(XLNetBaseConfig):
             self.d_model = d_model
             self.n_layer = n_layer
             self.n_head = n_head
+            assert d_model % n_head == 0
+            self.d_head = d_model // n_head
             self.ff_activation = ff_activation
             self.d_inner = d_inner
             self.untie_r = untie_r
             self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
         else:
@@ -233,8 +241,8 @@ class XLNetRunConfig(XLNetBaseConfig):
     We store them separately from XLNetConfig for flexibility.
     """
     def __init__(self, 
-                 dropout,
-                 dropatt,
+                 dropout=0.1,
+                 dropatt=0.1,
                  init="normal",
                  init_range=0.1,
                  init_std=0.02,
@@ -278,12 +286,12 @@ try:
 except ImportError:
     logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
     class XLNetLayerNorm(nn.Module):
-        def __init__(self, hidden_size, eps=1e-12):
+        def __init__(self, d_model, eps=1e-12):
             """Construct a layernorm module in the TF style (epsilon inside the square root).
             """
             super(XLNetLayerNorm, self).__init__()
-            self.weight = nn.Parameter(torch.ones(hidden_size))
-            self.bias = nn.Parameter(torch.zeros(hidden_size))
+            self.weight = nn.Parameter(torch.ones(d_model))
+            self.bias = nn.Parameter(torch.zeros(d_model))
             self.variance_epsilon = eps
 
         def forward(self, x):
@@ -292,6 +300,220 @@ except ImportError:
             x = (x - u) / torch.sqrt(s + self.variance_epsilon)
             return self.weight * x + self.bias
 
+class XLNetRelativeAttention(nn.Module):
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetRelativeAttention, self).__init__()
+        self.output_attentions = output_attentions
+        if config.d_model % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.d_model, config.num_attention_heads))
+        self.output_attentions = output_attentions
+        self.keep_multihead_output = keep_multihead_output
+        self.multihead_output = None
+
+        self.n_head = config.num_attention_heads
+        self.d_head = config.d_head
+        self.d_model = config.d_model
+        self.scale = 1 / (config.d_head ** 0.5)
+
+        self.q = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
+        self.k = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
+        self.v = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
+        self.o = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
+        self.r = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
+
+        self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        self.r_s_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        self.seg_embed = nn.Parameter(torch.Tensor(self.n_head, 2, self.d_head))
+
+        self.LayerNorm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.dropout)
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None):
+        """Core relative positional attention operations."""
+
+        # content based attention score
+        ac = torch.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h)
+
+        # position based attention score
+        bd = torch.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
+        bd = rel_shift(bd, klen=torch.shape(ac)[1])
+
+        # segment based attention score
+        if seg_mat is None:
+            ef = 0
+        else:
+            ef = torch.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed)
+            ef = torch.einsum('ijbs,ibns->ijbn', seg_mat, ef)
+
+        # merge attention scores and perform masking
+        attn_score = (ac + bd + ef) * self.scale
+        if attn_mask is not None:
+            # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
+            attn_score = attn_score - 1e30 * attn_mask
+
+        # attention probability
+        attn_prob = F.softmax(attn_score, dim=1)
+        attn_prob = self.dropout(attn_prob)
+
+        # attention output
+        attn_vec = torch.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
+
+        return attn_vec
+
+    def post_attention(self, h, attn_vec, residual=True):
+        """Post-attention processing."""
+        # post-attention projection (back to `d_model`)
+        attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.o)
+
+        attn_out = self.dropout(attn_out)
+        if residual:
+            attn_out = attn_out + h
+        output = self.LayerNorm(attn_out)
+
+        return output
+
+    def forward(self, h, g,
+                      attn_mask_h, attn_mask_g,
+                      r, seg_mat,
+                      mems=None, target_mapping=None, head_mask=None):
+        if g is not None:
+            ###### Two-stream attention with relative positional encoding.
+            # content based attention score
+            if mems is not None and mems.dim() > 1:
+                cat = torch.cat([mems, h], dim=0)
+            else:
+                cat = h
+
+            # content-based key head
+            k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k)
+
+            # content-based value head
+            v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v)
+
+            # position-based key head
+            k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r)
+
+            ##### h-stream
+            # content-stream query head
+            q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q)
+
+            # core attention ops
+            attn_vec_h = self.rel_attn_core(
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h)
+
+            # post processing
+            output_h = self.post_attention(h, attn_vec_h)
+
+            ##### g-stream
+            # query-stream query head
+            q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q)
+
+            # core attention ops
+            if target_mapping is not None:
+                q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
+                attn_vec_g = self.rel_attn_core(
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g)
+                attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
+            else:
+                attn_vec_g = self.rel_attn_core(
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g)
+
+            # post processing
+            output_g = self.post_attention(g, attn_vec_g)
+            attention_output = output_h, output_g
+        else:
+            ###### Multi-head attention with relative positional encoding
+            if mems is not None and mems.dim() > 1:
+                cat = torch.cat([mems, h], dim=0)
+            else:
+                cat = h
+
+            # content heads
+            q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q)
+            k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k)
+            v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v)
+
+            # positional heads
+            k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r)
+
+            # core attention ops
+            attn_vec = self.rel_attn_core(
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h)
+
+            # post processing
+            attention_output = self.post_attention(h, attn_vec)
+
+
+        # Mask heads if we want to
+        # if head_mask is not None:
+        #     attention_probs = attention_probs * head_mask
+
+        # context_layer = torch.matmul(attention_probs, value_layer)
+        # if self.keep_multihead_output:
+        #     self.multihead_output = context_layer
+        #     self.multihead_output.retain_grad()
+
+        # context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        # new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        # context_layer = context_layer.view(*new_context_layer_shape)
+
+        # if self.output_attentions:
+        #     attentions, self_output = self_output
+        # if self.output_attentions:
+        #     return attentions, attention_output
+        return attention_output
+
+class XLNetFeedForward(nn.Module):
+    def __init__(self, config):
+        super(XLNetFeedForward, self).__init__()
+        self.LayerNorm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
+        self.layer_1 = nn.Linear(config.d_model, config.d_inner)
+        self.layer_2 = nn.Linear(config.d_inner, config.d_model)
+        self.dropout = nn.Dropout(config.dropout)
+        if isinstance(config.ff_activation, str) or (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
+            self.activation_function = ACT2FN[config.ff_activation]
+        else:
+            self.activation_function = config.ff_activation
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.layer_1(hidden_states)
+        hidden_states = self.activation_function(hidden_states)
+        hidden_states = self.layer_2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+class XLNetLayer(nn.Module):
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetLayer, self).__init__()
+        self.output_attentions = output_attentions
+        self.rel_attn = XLNetRelativeAttention(config, output_attentions=output_attentions,
+                                               keep_multihead_output=keep_multihead_output)
+        self.ff = XLNetFeedForward(config)
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, output_h, output_g,
+                attn_mask_h, attn_mask_g,
+                r, seg_mat, r, seg_mat,
+                two_streams=False, mems=None, target_mapping=None, head_mask=None):
+        output_h, output_g = self.rel_attn(output_h, output_g,
+                                           attn_mask_h, attn_mask_g,
+                                           r, seg_mat,
+                                           mems=mems, target_mapping=target_mapping, head_mask=head_mask)
+        if two_streams:
+            output_g = self.ff(output_g)
+        output_h = self.ff(output_h)
+
+        # if self.output_attentions:
+        #     return attentions, layer_output
+        return output_h, output_g
+
 class XLNetPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -445,6 +667,228 @@ class XLNetPreTrainedModel(nn.Module):
 
 
 class XLNetModel(XLNetPreTrainedModel):
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetModel, self).__init__()
+        self.output_attentions = output_attentions
+        self.mem_len = config.mem_len
+        self.reuse_len = config.reuse_len
+        layer = XLNetLayer(config, output_attentions=output_attentions,
+                                  keep_multihead_output=keep_multihead_output)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+
+    @classmethod
+    def _create_mask(qlen, mlen, dtype=torch.float, same_length=False):
+        """create causal attention mask."""
+        attn_mask = torch.ones([qlen, qlen], dtype=dtype)
+        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
+        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
+        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
+        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
+        if same_length:
+            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
+            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
+
+        return ret
+
+    def cache_mem(self, curr_out, prev_mem):
+        """cache hidden states into memory."""
+        if self.mem_len is None or self.mem_len == 0:
+            return None
+        else:
+            if self.reuse_len is not None and self.reuse_len > 0:
+                curr_out = curr_out[:self.reuse_len]
+
+            if prev_mem is None:
+                new_mem = curr_out[-self.mem_len:]
+            else:
+                new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:]
+
+        return new_mem.detach()
+
+    def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=torch.float):
+        """create relative positional encoding."""
+        freq_seq = torch.zrange(0, d_model, 2.0, dtype=dtype)
+        inv_freq = 1 / (10000 ** (freq_seq / self.config.d_model))
+
+        if self.attn_type == 'bi':
+            # beg, end = klen - 1, -qlen
+            beg, end = klen, -qlen
+        elif self.attn_type == 'uni':
+            # beg, end = klen - 1, -1
+            beg, end = klen, -1
+        else:
+            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+
+        if self.bi_data:
+            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=dtype)
+            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=dtype)
+
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+                bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+
+            if bsz is not None:
+                fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
+                bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+            else:
+                fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq)
+                bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq)
+
+            pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
+        else:
+            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=dtype)
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+            pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz)
+
+        return pos_emb
+
+    def forward(self, inp_k, seg_id=None, input_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
+            seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
+            input_mask: float32 Tensor in shape [len, bsz], the input mask.
+                0 for real tokens and 1 for padding.
+            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: float32 Tensor in shape [len, len, bsz].
+                If perm_mask[i, j, k] = 0, i attend to j in batch k;
+                if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: float32 Tensor in shape [num_predict, len, bsz].
+                If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: float32 Tensor in shape [len, bsz].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
+
+            mem_len: int, the number of tokens to cache.
+            reuse_len: int, the number of tokens in the currect batch to be cached
+                and reused in the future.
+            bi_data: bool, whether to use bidirectional input pipeline.
+                Usually set to True during pretraining and False during finetuning.
+            clamp_len: int, clamp all relative distances larger than clamp_len.
+                -1 means no clamping.
+            same_length: bool, whether to use the same attention length for each token.
+            summary_type: str, "last", "first", "mean", or "attn". The method
+                to pool the input to get a vector representation.
+        """
+        qlen, bsz = inp_k.shape
+        mlen = mems[0].shape[0] if mems is not None else 0
+        klen = mlen + qlen
+
+        ##### Attention mask
+        # causal attention mask
+        if self.attn_type == 'uni':
+            attn_mask = _create_mask(qlen, mlen, inp_k.dtype, self.same_length)
+            attn_mask = attn_mask[:, :, None, None]
+        elif self.attn_type == 'bi':
+            attn_mask = None
+        else:
+            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+
+        # data mask: input mask & perm mask
+        if input_mask is not None and perm_mask is not None:
+            data_mask = input_mask[None] + perm_mask
+        elif input_mask is not None and perm_mask is None:
+            data_mask = input_mask[None]
+        elif input_mask is None and perm_mask is not None:
+            data_mask = perm_mask
+        else:
+            data_mask = None
+
+        if data_mask is not None:
+            # all mems can be attended to
+            mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz], dtype=data_mask.dtype, device=data_mask.device)
+            data_mask = torch.cat([mems_mask, data_mask], dim=1)
+            if attn_mask is None:
+                attn_mask = data_mask[:, :, :, None]
+            else:
+                attn_mask += data_mask[:, :, :, None]
+
+        if attn_mask is not None:
+            attn_mask = (attn_mask > 0).float()
+
+        if attn_mask is not None:
+            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
+            non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=tf_float),
+                                    non_tgt_mask], axis=-1)
+            non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0,
+                                    dtype=tf_float)
+        else:
+            non_tgt_mask = None
+
+        ##### Word embedding
+        word_emb_k = self.word_embedding(inp_k)
+        output_h = self.dropout(word_emb_k)
+        if inp_q is not None:
+            if target_mapping is not None:
+                word_emb_q = mask_emb.expand(target_mapping.shape[0], bsz, 1)
+            else:
+                inp_q_ext = inp_q[:, :, None]
+                word_emb_q = inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
+            output_g = self.dropout(word_emb_q)
+        else:
+            output_g = None
+
+        ##### Segment embedding
+        if seg_id is not None:
+            # Convert `seg_id` to one-hot `seg_mat`
+            mem_pad = torch.zeros([mlen, bsz], dtype=torch.long)
+            cat_ids = torch.cat([mem_pad, seg_id], dim=0)
+
+            # `1` indicates not in the same segment [qlen x klen x bsz]
+            seg_mat = (seg_id[:, None] != cat_ids[None, :]).long()
+            # seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
+        else:
+            seg_mat = None
+
+        ##### Positional encoding
+        pos_emb = relative_positional_encoding(qlen, klen, bsz=bsz, dtype=inp_k.dtype)
+        pos_emb = self.dropout(pos_emb)
+
+        ##### Head mask if needed (for bertology/pruning)
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        new_mems = []
+        if mems is None:
+            mems = [None] * len(self.layer)
+
+        for i, layer_module in enumerate(self.layer):
+            # cache new mems
+            new_mems.append(self.cache_mem(output_h, mems[i]))
+
+            output_h, output_g = layer_module(output_h, output_g,
+                                              attn_mask_h, attn_mask_g,
+                                              r, seg_mat,
+                                              mems=mems[i], target_mapping=target_mapping,
+                                              head_mask=head_mask)
+
+        output = self.dropout(output_g if output_g is not None else output_h)
+
+        return output
+
+
+class XLNetLMHeadModel(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
 
     Params:
@@ -473,10 +917,10 @@ class XLNetModel(XLNetPreTrainedModel):
         `encoded_layers`: controled by `output_all_encoded_layers` argument:
             - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                 of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), each
-                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, d_model],
             - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                to the last attention block of shape [batch_size, sequence_length, hidden_size],
-        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+                to the last attention block of shape [batch_size, sequence_length, d_model],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, d_model] which is the output of a
             classifier pretrained on top of the hidden state associated to the first character of the
             input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
 
@@ -487,16 +931,30 @@ class XLNetModel(XLNetPreTrainedModel):
     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(XLNetModel, self).__init__(config)
+    def __init__(self, config, run_config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetLMHeadModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.attn_type = run_config.attn_type
+        self.same_length = run_config.same_length
+
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
+        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, self.d_model))
+        self.transformer = XLNetModel(config,
+                                            output_attentions=output_attentions,
+                                            keep_multihead_output=keep_multihead_output)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
+        self.dropout = nn.Dropout(config.dropout)
+        # Tie weights
+        if config.tie_weight:
+            self.lm_loss.weight = self.word_embedding.weight
+
         self.apply(self.init_xlnet_weights)
 
     def prune_heads(self, heads_to_prune):
@@ -512,54 +970,56 @@ class XLNetModel(XLNetPreTrainedModel):
         """
         return [layer.attention.self.multihead_output for layer in self.encoder.layer]
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
+    def forward(self, inp_k, seg_id=None, input_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
+            seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
+            input_mask: float32 Tensor in shape [len, bsz], the input mask.
+                0 for real tokens and 1 for padding.
+            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: float32 Tensor in shape [len, len, bsz].
+                If perm_mask[i, j, k] = 0, i attend to j in batch k;
+                if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: float32 Tensor in shape [num_predict, len, bsz].
+                If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: float32 Tensor in shape [len, bsz].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            mem_len: int, the number of tokens to cache.
+            reuse_len: int, the number of tokens in the currect batch to be cached
+                and reused in the future.
+            bi_data: bool, whether to use bidirectional input pipeline.
+                Usually set to True during pretraining and False during finetuning.
+            clamp_len: int, clamp all relative distances larger than clamp_len.
+                -1 means no clamping.
+            same_length: bool, whether to use the same attention length for each token.
+            summary_type: str, "last", "first", "mean", or "attn". The method
+                to pool the input to get a vector representation.
+        """
+        output, new_mems = self.transformer(output_h, non_tgt_mask, r, seg_mat,
+                                            output_g=output_g, attn_mask_g=attn_mask,
+                                            mems=mems, target_mapping=target_mapping,
+                                            head_mask=head_mask)
 
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        logits = self.lm_loss(output)
 
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(input_ids, token_type_ids)
-        encoded_layers = self.encoder(embedding_output,
-                                      extended_attention_mask,
-                                      output_all_encoded_layers=output_all_encoded_layers,
-                                      head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, encoded_layers = encoded_layers
-        sequence_output = encoded_layers[-1]
-        pooled_output = self.pooler(sequence_output)
-        if not output_all_encoded_layers:
-            encoded_layers = encoded_layers[-1]
-        if self.output_attentions:
-            return all_attentions, encoded_layers, pooled_output
-        return encoded_layers, pooled_output
-    
\ No newline at end of file
+        # if self.output_attentions:
+        #     all_attentions, encoded_layers = encoded_layers
+        # sequence_output = encoded_layers[-1]
+        # pooled_output = self.pooler(sequence_output)
+        # if not output_all_encoded_layers:
+        #     encoded_layers = encoded_layers[-1]
+        # if self.output_attentions:
+        #     return all_attentions, encoded_layers, pooled_output
+        return output, new_mems

From b407972e27332a6b8328a4f1dd7ae8dc863fb6c2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 20 Jun 2019 13:52:56 +0200
Subject: [PATCH 003/139] update gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index aeff829aa0..8abc9b84e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -122,4 +122,5 @@ dmypy.json
 tensorflow_code
 
 # Models
-models
\ No newline at end of file
+models
+proc_data
\ No newline at end of file

From 45709d7532a2bb08106861994ea41789beadc611 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 00:28:42 +0200
Subject: [PATCH 004/139] model running with simple inputs

---
 pytorch_pretrained_bert/__init__.py           |   3 +
 .../convert_xlnet_checkpoint_to_pytorch.py    |   4 +-
 pytorch_pretrained_bert/modeling.py           |   2 +-
 pytorch_pretrained_bert/modeling_gpt2.py      |   2 +-
 pytorch_pretrained_bert/modeling_openai.py    |   2 +-
 pytorch_pretrained_bert/modeling_xlnet.py     | 250 +++++++++++-------
 tests/modeling_xlnet_test.py                  | 247 +++++++++++++++++
 7 files changed, 403 insertions(+), 107 deletions(-)
 create mode 100644 tests/modeling_xlnet_test.py

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 508988322b..ded1f7093b 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -17,6 +17,9 @@ from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHe
 from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,
                             load_tf_weights_in_gpt2)
+from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig,
+                             XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+                             load_tf_weights_in_xlnet)
 
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index b9220d60cd..eb89745be2 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 import argparse
 import torch
 
-from pytorch_pretrained_bert.modeling_xlnet import XLNetConfig, XLNetRunConfig, XLNetModel, load_tf_weights_in_xlnet
+from pytorch_pretrained_bert.modeling_xlnet import XLNetConfig, XLNetRunConfig, XLNetLMHeadModel, load_tf_weights_in_xlnet
 
 def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = XLNetConfig.from_json_file(bert_config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = XLNetModel(config)
+    model = XLNetLMHeadModel(config)
 
     # Load weights from tf checkpoint
     load_tf_weights_in_xlnet(model, tf_checkpoint_path)
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 81825f2f35..2b67a260f0 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -867,7 +867,7 @@ class BertModel(BertPreTrainedModel):
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index c17a7e7d26..c4c4876833 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -722,7 +722,7 @@ class GPT2Model(GPT2PreTrainedModel):
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f02f016ef1..b4df679fe6 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -718,7 +718,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 1ae3167713..39a2d95a4f 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -29,6 +29,7 @@ from io import open
 
 import torch
 from torch import nn
+from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss
 
 from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
@@ -126,32 +127,27 @@ def swish(x):
 
 ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
-def positional_embedding(pos_seq, inv_freq, bsz=None):
-    sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
-    pos_emb = torch.cat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
-    pos_emb = pos_emb[:, None, :]
-
-    if bsz is not None:
-        pos_emb = pos_emb.expand(1, bsz, 1)
-
-    return pos_emb
-
 class XLNetBaseConfig(object):
     @classmethod
     def from_dict(cls, json_object):
-        """Constructs a `XLNetConfig` from a Python dictionary of parameters."""
-        config = XLNetConfig(vocab_size_or_config_json_file=-1)
+        """Constructs a `XLNetBaseConfig` from a Python dictionary of parameters."""
+        config = cls(vocab_size_or_config_json_file=-1)
         for key, value in json_object.items():
             config.__dict__[key] = value
         return config
 
     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `XLNetConfig` from a json file of parameters."""
+        """Constructs a `XLNetBaseConfig` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
 
+    def update(self, other):
+        dict_b = other.to_dict()
+        for key, value in dict_b.items():
+            self.__dict__[key] = value
+
     def __repr__(self):
         return str(self.to_json_string())
 
@@ -181,6 +177,7 @@ class XLNetConfig(XLNetBaseConfig):
                  d_inner=4096,
                  ff_activation="gelu",
                  untie_r=True,
+                 attn_type="bi",
 
                  max_position_embeddings=512,
                  initializer_range=0.02,
@@ -198,6 +195,7 @@ class XLNetConfig(XLNetBaseConfig):
             ff_activation: The non-linear activation function (function or string) in the
                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
             untie_r: untie relative position biases
+            attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
 
             dropout: The dropout probabilitiy for all fully connected
                 layers in the embeddings, encoder, and pooler.
@@ -226,6 +224,7 @@ class XLNetConfig(XLNetBaseConfig):
             self.ff_activation = ff_activation
             self.d_inner = d_inner
             self.untie_r = untie_r
+            self.attn_type = attn_type
             self.max_position_embeddings = max_position_embeddings
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
@@ -304,15 +303,15 @@ class XLNetRelativeAttention(nn.Module):
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
         super(XLNetRelativeAttention, self).__init__()
         self.output_attentions = output_attentions
-        if config.d_model % config.num_attention_heads != 0:
+        if config.d_model % config.n_head != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.d_model, config.num_attention_heads))
+                "heads (%d)" % (config.d_model, config.n_head))
         self.output_attentions = output_attentions
         self.keep_multihead_output = keep_multihead_output
         self.multihead_output = None
 
-        self.n_head = config.num_attention_heads
+        self.n_head = config.n_head
         self.d_head = config.d_head
         self.d_model = config.d_model
         self.scale = 1 / (config.d_head ** 0.5)
@@ -326,7 +325,7 @@ class XLNetRelativeAttention(nn.Module):
         self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
         self.r_s_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
         self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
-        self.seg_embed = nn.Parameter(torch.Tensor(self.n_head, 2, self.d_head))
+        self.seg_embed = nn.Parameter(torch.Tensor(2, self.n_head, self.d_head))
 
         self.LayerNorm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.dropout)
@@ -334,6 +333,18 @@ class XLNetRelativeAttention(nn.Module):
     def prune_heads(self, heads):
         raise NotImplementedError
 
+    @staticmethod
+    def rel_shift(x, klen=-1):
+        """perform relative shift to form the relative attention score."""
+        x_size = x.shape
+
+        x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
+        x = x[1:, ...]
+        x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
+        x = x[:, 0:klen, :, :]
+
+        return x
+
     def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None):
         """Core relative positional attention operations."""
 
@@ -342,7 +353,7 @@ class XLNetRelativeAttention(nn.Module):
 
         # position based attention score
         bd = torch.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
-        bd = rel_shift(bd, klen=torch.shape(ac)[1])
+        bd = self.rel_shift(bd, klen=ac.shape[1])
 
         # segment based attention score
         if seg_mat is None:
@@ -426,7 +437,6 @@ class XLNetRelativeAttention(nn.Module):
 
             # post processing
             output_g = self.post_attention(g, attn_vec_g)
-            attention_output = output_h, output_g
         else:
             ###### Multi-head attention with relative positional encoding
             if mems is not None and mems.dim() > 1:
@@ -447,7 +457,8 @@ class XLNetRelativeAttention(nn.Module):
                 q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h)
 
             # post processing
-            attention_output = self.post_attention(h, attn_vec)
+            output_h = self.post_attention(h, attn_vec)
+            output_g = None
 
 
         # Mask heads if we want to
@@ -467,7 +478,7 @@ class XLNetRelativeAttention(nn.Module):
         #     attentions, self_output = self_output
         # if self.output_attentions:
         #     return attentions, attention_output
-        return attention_output
+        return output_h, output_g
 
 class XLNetFeedForward(nn.Module):
     def __init__(self, config):
@@ -481,13 +492,15 @@ class XLNetFeedForward(nn.Module):
         else:
             self.activation_function = config.ff_activation
 
-    def forward(self, hidden_states, input_tensor):
-        hidden_states = self.layer_1(hidden_states)
-        hidden_states = self.activation_function(hidden_states)
-        hidden_states = self.layer_2(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
-        return hidden_states
+    def forward(self, inp):
+        output = inp
+        output = self.layer_1(output)
+        output = self.activation_function(output)
+        output = self.dropout(output)
+        output = self.layer_2(output)
+        output = self.dropout(output)
+        output = self.LayerNorm(output + inp)
+        return output
 
 class XLNetLayer(nn.Module):
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
@@ -500,13 +513,13 @@ class XLNetLayer(nn.Module):
 
     def forward(self, output_h, output_g,
                 attn_mask_h, attn_mask_g,
-                r, seg_mat, r, seg_mat,
-                two_streams=False, mems=None, target_mapping=None, head_mask=None):
+                r, seg_mat,
+                mems=None, target_mapping=None, head_mask=None):
         output_h, output_g = self.rel_attn(output_h, output_g,
                                            attn_mask_h, attn_mask_g,
                                            r, seg_mat,
                                            mems=mems, target_mapping=target_mapping, head_mask=head_mask)
-        if two_streams:
+        if output_g is not None:
             output_g = self.ff(output_g)
         output_h = self.ff(output_h)
 
@@ -520,9 +533,9 @@ class XLNetPreTrainedModel(nn.Module):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(XLNetPreTrainedModel, self).__init__()
-        if not isinstance(config, XLNetConfig):
+        if not isinstance(config, XLNetBaseConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `XLNetConfig`. "
+                "Parameter config in `{}(config)` should be an instance of class `XLNetBaseConfig`. "
                 "To create a model from a Google pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
@@ -668,26 +681,41 @@ class XLNetPreTrainedModel(nn.Module):
 
 class XLNetModel(XLNetPreTrainedModel):
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(XLNetModel, self).__init__()
+        super(XLNetModel, self).__init__(config)
         self.output_attentions = output_attentions
         self.mem_len = config.mem_len
         self.reuse_len = config.reuse_len
+        self.d_model = config.d_model
+        self.same_length = config.same_length
+        self.attn_type = config.attn_type
+        self.bi_data = config.bi_data
+        self.clamp_len = config.clamp_len
+
         layer = XLNetLayer(config, output_attentions=output_attentions,
-                                  keep_multihead_output=keep_multihead_output)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+                                   keep_multihead_output=keep_multihead_output)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
+        self.dropout = nn.Dropout(config.dropout)
 
-    @classmethod
-    def _create_mask(qlen, mlen, dtype=torch.float, same_length=False):
-        """create causal attention mask."""
-        attn_mask = torch.ones([qlen, qlen], dtype=dtype)
-        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
-        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
-        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
-        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
-        if same_length:
-            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
-            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
+    def create_mask(self, qlen, mlen):
+        """ create causal attention mask.
+            float mask where 1.0 indicate masked, 0.0 indicated not-masked.
+             same_length=False:      same_length=True:
+             <mlen > <  qlen >       <mlen > <  qlen >
+          ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
+            [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
+       qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
+            [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
+          v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
+        """
+        attn_mask = torch.ones([qlen, qlen])
+        mask_up = torch.triu(attn_mask, diagonal=1)
+        attn_mask_pad = torch.zeros([qlen, mlen])
+        ret = torch.cat([attn_mask_pad, mask_up], dim=1)
+        if self.same_length:
+            mask_lo = torch.tril(attn_mask, diagonal=-1)
+            ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1)
 
+        ret = ret.to(next(self.parameters()))
         return ret
 
     def cache_mem(self, curr_out, prev_mem):
@@ -705,10 +733,21 @@ class XLNetModel(XLNetPreTrainedModel):
 
         return new_mem.detach()
 
-    def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=torch.float):
+    @staticmethod
+    def positional_embedding(pos_seq, inv_freq, bsz=None):
+        sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
+        pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
+        pos_emb = pos_emb[:, None, :]
+
+        if bsz is not None:
+            pos_emb = pos_emb.expand(-1, bsz, -1)
+
+        return pos_emb
+
+    def relative_positional_encoding(self, qlen, klen, bsz=None):
         """create relative positional encoding."""
-        freq_seq = torch.zrange(0, d_model, 2.0, dtype=dtype)
-        inv_freq = 1 / (10000 ** (freq_seq / self.config.d_model))
+        freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
+        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
 
         if self.attn_type == 'bi':
             # beg, end = klen - 1, -qlen
@@ -720,51 +759,52 @@ class XLNetModel(XLNetPreTrainedModel):
             raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
 
         if self.bi_data:
-            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=dtype)
-            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=dtype)
+            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
+            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float)
 
             if self.clamp_len > 0:
                 fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
                 bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
 
             if bsz is not None:
-                fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
-                bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
             else:
-                fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq)
-                bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq)
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
 
             pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
         else:
-            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=dtype)
+            fwd_pos_seq = torch.arange(beg, end, -1.0)
             if self.clamp_len > 0:
                 fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-            pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz)
+            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
 
+        pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, inp_k, seg_id=None, input_mask=None,
+    def forward(self, word_emb_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 output_all_encoded_layers=True, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
+            word_emb_k: float32 Tensor in shape [len, bsz, d_model], the input token embeddings.
             seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
-            input_mask: float32 Tensor in shape [len, bsz], the input mask.
+            input_mask: [optional] float32 Tensor in shape [len, bsz], the input mask.
                 0 for real tokens and 1 for padding.
-            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
-            perm_mask: float32 Tensor in shape [len, len, bsz].
+            perm_mask: [optional] float32 Tensor in shape [len, len, bsz].
                 If perm_mask[i, j, k] = 0, i attend to j in batch k;
                 if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
                 If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [num_predict, len, bsz].
+            target_mapping: [optional] float32 Tensor in shape [num_predict, len, bsz].
                 If target_mapping[i, j, k] = 1, the i-th predict in batch k is
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: float32 Tensor in shape [len, bsz].
+            inp_q: [optional] float32 Tensor in shape [len, bsz].
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
@@ -780,14 +820,16 @@ class XLNetModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        qlen, bsz = inp_k.shape
+        qlen, bsz = word_emb_k.shape[0], word_emb_k.shape[1]
         mlen = mems[0].shape[0] if mems is not None else 0
         klen = mlen + qlen
+        dtype_float = word_emb_k.dtype
+        device = word_emb_k.device
 
         ##### Attention mask
         # causal attention mask
         if self.attn_type == 'uni':
-            attn_mask = _create_mask(qlen, mlen, inp_k.dtype, self.same_length)
+            attn_mask = self.create_mask(qlen, mlen)
             attn_mask = attn_mask[:, :, None, None]
         elif self.attn_type == 'bi':
             attn_mask = None
@@ -806,7 +848,7 @@ class XLNetModel(XLNetPreTrainedModel):
 
         if data_mask is not None:
             # all mems can be attended to
-            mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz], dtype=data_mask.dtype, device=data_mask.device)
+            mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
             data_mask = torch.cat([mems_mask, data_mask], dim=1)
             if attn_mask is None:
                 attn_mask = data_mask[:, :, :, None]
@@ -814,23 +856,20 @@ class XLNetModel(XLNetPreTrainedModel):
                 attn_mask += data_mask[:, :, :, None]
 
         if attn_mask is not None:
-            attn_mask = (attn_mask > 0).float()
+            attn_mask = (attn_mask > 0).to(dtype_float)
 
         if attn_mask is not None:
-            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
-            non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=tf_float),
-                                    non_tgt_mask], axis=-1)
-            non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0,
-                                    dtype=tf_float)
+            non_tgt_mask = -torch.eye(qlen).to(attn_mask)
+            non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
+            non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
         else:
             non_tgt_mask = None
 
-        ##### Word embedding
-        word_emb_k = self.word_embedding(inp_k)
+        ##### Process Word embeddings and prepare h & g hidden states
         output_h = self.dropout(word_emb_k)
         if inp_q is not None:
             if target_mapping is not None:
-                word_emb_q = mask_emb.expand(target_mapping.shape[0], bsz, 1)
+                word_emb_q = mask_emb.expand(target_mapping.shape[0], bsz, -1)
             else:
                 inp_q_ext = inp_q[:, :, None]
                 word_emb_q = inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
@@ -841,33 +880,33 @@ class XLNetModel(XLNetPreTrainedModel):
         ##### Segment embedding
         if seg_id is not None:
             # Convert `seg_id` to one-hot `seg_mat`
-            mem_pad = torch.zeros([mlen, bsz], dtype=torch.long)
+            mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
             cat_ids = torch.cat([mem_pad, seg_id], dim=0)
 
             # `1` indicates not in the same segment [qlen x klen x bsz]
             seg_mat = (seg_id[:, None] != cat_ids[None, :]).long()
-            # seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
+            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
         else:
             seg_mat = None
 
         ##### Positional encoding
-        pos_emb = relative_positional_encoding(qlen, klen, bsz=bsz, dtype=inp_k.dtype)
+        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
         pos_emb = self.dropout(pos_emb)
 
         ##### Head mask if needed (for bertology/pruning)
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        # input head_mask has shape [num_heads] or [n_layer x num_heads]
+        # and head_mask is converted to shape [n_layer x batch x num_heads x seq_length x seq_length]
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
         else:
-            head_mask = [None] * self.config.num_hidden_layers
+            head_mask = [None] * self.config.n_layer
 
         new_mems = []
         if mems is None:
@@ -878,14 +917,14 @@ class XLNetModel(XLNetPreTrainedModel):
             new_mems.append(self.cache_mem(output_h, mems[i]))
 
             output_h, output_g = layer_module(output_h, output_g,
-                                              attn_mask_h, attn_mask_g,
-                                              r, seg_mat,
+                                              attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
+                                              r=pos_emb, seg_mat=seg_mat,
                                               mems=mems[i], target_mapping=target_mapping,
                                               head_mask=head_mask)
 
         output = self.dropout(output_g if output_g is not None else output_h)
 
-        return output
+        return output, new_mems
 
 
 class XLNetLMHeadModel(XLNetPreTrainedModel):
@@ -932,28 +971,27 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, run_config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
         super(XLNetLMHeadModel, self).__init__(config)
         self.output_attentions = output_attentions
-        self.attn_type = run_config.attn_type
-        self.same_length = run_config.same_length
+        self.attn_type = config.attn_type
+        self.same_length = config.same_length
 
         self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
-        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, self.d_model))
-        self.transformer = XLNetModel(config,
-                                            output_attentions=output_attentions,
-                                            keep_multihead_output=keep_multihead_output)
+        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
+        self.transformer = XLNetModel(config, output_attentions=output_attentions,
+                                              keep_multihead_output=keep_multihead_output)
         self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
         self.dropout = nn.Dropout(config.dropout)
+
         # Tie weights
-        if config.tie_weight:
-            self.lm_loss.weight = self.word_embedding.weight
+        self.lm_loss.weight = self.word_embedding.weight
 
         self.apply(self.init_xlnet_weights)
 
@@ -972,7 +1010,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     def forward(self, inp_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                output_all_encoded_layers=True, head_mask=None):
+                target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
@@ -1007,13 +1045,21 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, new_mems = self.transformer(output_h, non_tgt_mask, r, seg_mat,
-                                            output_g=output_g, attn_mask_g=attn_mask,
-                                            mems=mems, target_mapping=target_mapping,
-                                            head_mask=head_mask)
+        word_emb_k = self.word_embedding(inp_k)
+
+        output, new_mems = self.transformer(word_emb_k, seg_id, input_mask,
+                                            mems, perm_mask, target_mapping, inp_q,
+                                            output_all_encoded_layers, head_mask)
 
         logits = self.lm_loss(output)
 
+        if target is not None:
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(logits.view(-1, logits.size(-1)),
+                            target.view(-1))
+            return loss, new_mems
+
         # if self.output_attentions:
         #     all_attentions, encoded_layers = encoded_layers
         # sequence_output = encoded_layers[-1]
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
new file mode 100644
index 0000000000..30a6bfbec7
--- /dev/null
+++ b/tests/modeling_xlnet_test.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_pretrained_bert import (XLNetConfig, XLNetRunConfig, XLNetModel, XLNetLMHeadModel)
+from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+
+class XLNetModelTest(unittest.TestCase):
+    class XLNetModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     mem_len=30,
+                     clamp_len=15,
+                     reuse_len=15,
+                     is_training=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     cutoffs=[10, 50, 80],
+                     d_model=32,
+                     n_head=4,
+                     d_inner=128,
+                     n_layer=5,
+                     max_position_embeddings=10,
+                     untie_r=True,
+                     bi_data=False,
+                     same_length=False,
+                     seed=1,
+                     type_vocab_size=2):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.mem_len = mem_len
+            self.clamp_len = clamp_len
+            self.reuse_len = reuse_len
+            self.is_training = is_training
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.cutoffs = cutoffs
+            self.d_model = d_model
+            self.n_head = n_head
+            self.d_inner = d_inner
+            self.n_layer = n_layer
+            self.max_position_embeddings = max_position_embeddings
+            self.bi_data = bi_data
+            self.untie_r = untie_r
+            self.same_length = same_length
+            self.seed = seed
+            self.type_vocab_size = type_vocab_size
+
+        def prepare_config_and_inputs(self):
+            input_ids_1 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+            input_ids_2 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+            segment_ids = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.type_vocab_size)
+
+            lm_labels = None
+            if self.use_labels:
+                lm_labels = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+
+            config = XLNetConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                d_model=self.d_model,
+                n_head=self.n_head,
+                d_inner=self.d_inner,
+                n_layer=self.n_layer,
+                untie_r=self.untie_r,
+                max_position_embeddings=self.max_position_embeddings)
+
+            run_config = XLNetRunConfig(
+                mem_len=self.mem_len,
+                clamp_len=self.clamp_len,
+                same_length=self.same_length,
+                reuse_len=self.reuse_len,
+                bi_data=self.bi_data)
+
+            config.update(run_config)
+
+            return (config, input_ids_1, input_ids_2, segment_ids, lm_labels)
+
+        def set_seed(self):
+            random.seed(self.seed)
+            torch.manual_seed(self.seed)
+
+        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, segment_ids, lm_labels):
+            model = XLNetLMHeadModel(config)
+            model.eval()
+
+            hidden_states_1, mems_1 = model(input_ids_1, seg_id=segment_ids)
+            hidden_states_2, mems_2 = model(input_ids_2, seg_id=segment_ids, mems=mems_1)
+            outputs = {
+                "hidden_states_1": hidden_states_1,
+                "mems_1": mems_1,
+                "hidden_states_2": hidden_states_2,
+                "mems_2": mems_2,
+            }
+            return outputs
+
+        def check_transfo_xl_model_output(self, result):
+            self.parent.assertListEqual(
+                list(result["hidden_states_1"].size()),
+                [self.seq_length, self.batch_size, self.d_model])
+            self.parent.assertListEqual(
+                list(result["hidden_states_2"].size()),
+                [self.seq_length, self.batch_size, self.d_model])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+
+
+        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, segment_ids, lm_labels):
+            model = XLNetLMHeadModel(config)
+            model.eval()
+
+            loss_1, mems_1a = model(input_ids_1, target=lm_labels)
+            lm_logits_1, mems_1b = model(input_ids_1)
+
+            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
+            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
+
+            outputs = {
+                "loss_1": loss_1,
+                "mems_1a": mems_1a,
+                "lm_logits_1": lm_logits_1,
+                "mems_1b": mems_1b,
+                "loss_2": loss_2,
+                "mems_2a": mems_2a,
+                "lm_logits_2": lm_logits_2,
+                "mems_2b": mems_2b,
+            }
+            return outputs
+
+        def check_transfo_xl_lm_head_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss_1"].size()),
+                [self.seq_length, self.batch_size])
+            self.parent.assertListEqual(
+                list(result["lm_logits_1"].size()),
+                [self.seq_length, self.batch_size, self.vocab_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1a"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1b"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
+
+            self.parent.assertListEqual(
+                list(result["loss_2"].size()),
+                [self.seq_length, self.batch_size])
+            self.parent.assertListEqual(
+                list(result["lm_logits_2"].size()),
+                [self.seq_length, self.batch_size, self.vocab_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2a"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2b"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))
+
+    def test_default(self):
+        self.run_tester(XLNetModelTest.XLNetModelTester(self))
+
+    def test_config_to_json_string(self):
+        config = XLNetConfig(vocab_size_or_config_json_file=96, d_model=37)
+        obj = json.loads(config.to_json_string())
+        self.assertEqual(obj["n_token"], 96)
+        self.assertEqual(obj["d_model"], 37)
+
+    def test_config_to_json_file(self):
+        config_first = XLNetConfig(vocab_size_or_config_json_file=96, d_model=37)
+        json_file_path = "/tmp/config.json"
+        config_first.to_json_file(json_file_path)
+        config_second = XLNetConfig.from_json_file(json_file_path)
+        os.remove(json_file_path)
+        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+    def run_tester(self, tester):
+        config_and_inputs = tester.prepare_config_and_inputs()
+
+        tester.set_seed()
+        output_result = tester.create_transfo_xl_model(*config_and_inputs)
+        tester.check_transfo_xl_model_output(output_result)
+
+        tester.set_seed()
+        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
+        tester.check_transfo_xl_lm_head_output(output_result)
+
+    @classmethod
+    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
+        """Creates a random int32 tensor of the shape within the vocab size."""
+        if rng is None:
+            rng = random.Random()
+
+        total_dims = 1
+        for dim in shape:
+            total_dims *= dim
+
+        values = []
+        for _ in range(total_dims):
+            values.append(rng.randint(0, vocab_size - 1))
+
+        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+
+
+if __name__ == "__main__":
+    unittest.main()

From 32da75486bbfbcb7feb98b032dcf05e54e6f745d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 11:09:51 +0200
Subject: [PATCH 005/139] add tokenizer and tests

---
 pytorch_pretrained_bert/__init__.py           |   4 +-
 pytorch_pretrained_bert/modeling_xlnet.py     |  10 +-
 .../modeling_xlnet_utilities.py               | 111 ++++++++
 pytorch_pretrained_bert/tokenization_xlnet.py | 254 ++++++++++++++++++
 requirements.txt                              |   4 +-
 samples/test_sentencepiece.model              | Bin 0 -> 253154 bytes
 setup.py                                      |   3 +-
 tests/modeling_xlnet_test.py                  |  90 ++++---
 tests/tokenization_test.py                    |   2 +-
 tests/tokenization_transfo_xl_test.py         |   2 +-
 tests/tokenization_xlnet_test.py              |  88 ++++++
 11 files changed, 511 insertions(+), 57 deletions(-)
 create mode 100644 pytorch_pretrained_bert/modeling_xlnet_utilities.py
 create mode 100644 samples/test_sentencepiece.model
 create mode 100644 tests/tokenization_xlnet_test.py

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index ded1f7093b..7be5031d0e 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -3,6 +3,7 @@ from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_xlnet import XLNetTokenizer
 
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
@@ -24,4 +25,5 @@ from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig,
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
-from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
+from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path,
+                         WEIGHTS_NAME, CONFIG_NAME)
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 39a2d95a4f..08b193acfd 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -1034,14 +1034,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
 
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
@@ -1068,4 +1060,4 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         #     encoded_layers = encoded_layers[-1]
         # if self.output_attentions:
         #     return all_attentions, encoded_layers, pooled_output
-        return output, new_mems
+        return logits, new_mems
diff --git a/pytorch_pretrained_bert/modeling_xlnet_utilities.py b/pytorch_pretrained_bert/modeling_xlnet_utilities.py
new file mode 100644
index 0000000000..e2611b7a41
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_xlnet_utilities.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Utilities for PyTorch XLNet model.
+"""
+
+from collections import defaultdict
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+special_symbols = {
+    "<unk>"  : 0,
+    "<s>"    : 1,
+    "</s>"   : 2,
+    "<cls>"  : 3,
+    "<sep>"  : 4,
+    "<pad>"  : 5,
+    "<mask>" : 6,
+    "<eod>"  : 7,
+    "<eop>"  : 8,
+}
+
+VOCAB_SIZE = 32000
+UNK_ID = special_symbols["<unk>"]
+CLS_ID = special_symbols["<cls>"]
+SEP_ID = special_symbols["<sep>"]
+MASK_ID = special_symbols["<mask>"]
+EOD_ID = special_symbols["<eod>"]
+
+
+def permutation_mask(inputs, targets, is_masked, perm_size, seq_len):
+    """
+    Sample a permutation of the factorization order, and create an
+    attention mask accordingly.
+    Args:
+        inputs: int64 Tensor in shape [seq_len], input ids.
+        targets: int64 Tensor in shape [seq_len], target ids.
+        is_masked: bool Tensor in shape [seq_len]. True means being selected
+            for partial prediction.
+        perm_size: the length of longest permutation. Could be set to be reuse_len.
+            Should not be larger than reuse_len or there will be data leaks.
+        seq_len: int, sequence length.
+    """
+
+    # Generate permutation indices
+    index = np.arange(10)
+    index = np.transpose(np.reshape(index, [-1, perm_size]))
+    index = np.random.shuffle(index)
+    index = np.reshape(np.transpose(index), [-1])
+
+    # `perm_mask` and `target_mask`
+    # non-functional tokens
+    non_func_tokens = tf.logical_not(tf.logical_or(
+        tf.equal(inputs, SEP_ID),
+        tf.equal(inputs, CLS_ID)))
+
+    non_mask_tokens = tf.logical_and(tf.logical_not(is_masked), non_func_tokens)
+    masked_or_func_tokens = tf.logical_not(non_mask_tokens)
+
+    # Set the permutation indices of non-masked (& non-funcional) tokens to the
+    # smallest index (-1):
+    # (1) they can be seen by all other positions
+    # (2) they cannot see masked positions, so there won"t be information leak
+    smallest_index = -tf.ones([seq_len], dtype=tf.int64)
+    rev_index = tf.where(non_mask_tokens, smallest_index, index)
+
+    # Create `target_mask`: non-funcional and maksed tokens
+    # 1: use mask as input and have loss
+    # 0: use token (or [SEP], [CLS]) as input and do not have loss
+    target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens)
+    target_mask = tf.cast(target_tokens, tf.float32)
+
+    # Create `perm_mask`
+    # `target_tokens` cannot see themselves
+    self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1)
+
+    # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens)
+    # 0: can attend if i > j or j is non-masked
+    perm_mask = tf.logical_and(
+        self_rev_index[:, None] <= rev_index[None, :],
+        masked_or_func_tokens)
+    perm_mask = tf.cast(perm_mask, tf.float32)
+
+    # new target: [next token] for LM and [curr token] (self) for PLM
+    new_targets = tf.concat([inputs[0: 1], targets[: -1]],
+                            axis=0)
+
+    # construct inputs_k
+    inputs_k = inputs
+
+    # construct inputs_q
+    inputs_q = target_mask
+
+    return perm_mask, new_targets, target_mask, inputs_k, inputs_q
+
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index e69de29bb2..c9a3d40631 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -0,0 +1,254 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for XLNet model."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import json
+import logging
+import os
+import sys
+from shutil import copyfile
+from io import open
+
+import unicodedata
+import six
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+}
+VOCAB_NAME = 'spiece.model'
+SPECIAL_TOKENS_NAME = 'special_tokens.txt'
+
+SPIECE_UNDERLINE = '▁'
+
+class XLNetTokenizer(object):
+    """
+        SentencePiece based tokenizer. Peculiarities:
+            - requires SentencePiece: https://github.com/google/sentencepiece
+    """
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedBertModel from a pre-trained model file.
+        Download and cache the pre-trained model file if needed.
+        """
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            special_tokens_file = None
+        else:
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
+            if not os.path.exists(special_tokens_file):
+                special_tokens_file = None
+            else:
+                logger.info("loading special tokens file {}".format(special_tokens_file))
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download vocabulary.".format(
+                        vocab_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find files {}"
+                    "at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                        pretrained_model_name_or_path,
+                        vocab_file))
+            return None
+        if resolved_vocab_file == vocab_file:
+            logger.info("loading vocabulary file {}".format(vocab_file))
+        else:
+            logger.info("loading vocabulary file {} from cache at {}".format(
+                vocab_file, resolved_vocab_file))
+        # Instantiate tokenizer.
+        if special_tokens_file and 'special_tokens' not in kwargs:
+            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+        else:
+            special_tokens = kwargs.pop('special_tokens', [])
+        tokenizer = cls(resolved_vocab_file, special_tokens=special_tokens, *inputs, **kwargs)
+        return tokenizer
+
+    def __init__(self, vocab_file, special_tokens=None, max_len=None,
+                 do_lower_case=False, remove_space=True, keep_accents=False):
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
+
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+        self.special_tokens = {}
+        self.special_tokens_decoder = {}
+        self.set_special_tokens(special_tokens)
+
+    def __len__(self):
+        return len(self.encoder) + len(self.special_tokens)
+
+    def set_special_tokens(self, special_tokens):
+        """ Add a list of additional tokens to the encoder.
+            The additional tokens are indexed starting from the last index of the
+            current vocabulary in the order of the `special_tokens` list.
+        """
+        if not special_tokens:
+            self.special_tokens = {}
+            self.special_tokens_decoder = {}
+            return
+        self.special_tokens = dict((tok, len(self.sp_model) + i) for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
+        logger.info("Special tokens: %s", str(self.special_tokens))
+
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = ' '.join(inputs.strip().split())
+        else:
+            outputs = inputs
+        outputs = outputs.replace("``", '"').replace("''", '"')
+
+        if six.PY2 and isinstance(outputs, str):
+            outputs = outputs.decode('utf-8')
+
+        if not self.keep_accents:
+            outputs = unicodedata.normalize('NFKD', outputs)
+            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+        if self.do_lower_case:
+            outputs = outputs.lower()
+
+        return outputs
+
+    def tokenize(self, text, return_unicode=True, sample=False):
+        """ Tokenize a string.
+            return_unicode is used only for py2
+        """
+        text = self.preprocess_text(text)
+        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
+        if six.PY2 and isinstance(text, unicode):
+            text = text.encode('utf-8')
+
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        new_pieces = []
+        for piece in pieces:
+            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(
+                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
+                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+                    if len(cur_pieces[0]) == 1:
+                        cur_pieces = cur_pieces[1:]
+                    else:
+                        cur_pieces[0] = cur_pieces[0][1:]
+                cur_pieces.append(piece[-1])
+                new_pieces.extend(cur_pieces)
+            else:
+                new_pieces.append(piece)
+
+        # note(zhiliny): convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in new_pieces:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            new_pieces = ret_pieces
+
+        return new_pieces
+
+    def convert_tokens_to_ids(self, tokens, sample=False):
+        """ Converts a sequence of tokens into ids using the vocab. """
+        ids = []
+        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.sp_model.PieceToId(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.sp_model.PieceToId(token))
+        if len(ids) > self.max_len:
+            logger.warning(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this XLNet model ({} > {}). Running this"
+                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """Converts a sequence of ids in tokens."""
+        tokens = []
+        for i in ids:
+            if i in self.special_tokens_decoder:
+                if not skip_special_tokens:
+                    tokens.append(self.special_tokens_decoder[i])
+            else:
+                tokens.append(self.sp_model.IdToPiece(i))
+        return tokens
+
+    def encode(self, text, sample=False):
+        return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
+
+    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
+        out_string = ''.join(tokens)
+        if clean_up_tokenization_spaces:
+            out_string = out_string.strip().replace('<unk>', '')
+            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+        return out_string
+
+    def save_vocabulary(self, vocab_path):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(vocab_path):
+            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+            return
+        out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+
+        copyfile(self.vocab_file, out_vocab_file)
+
+        index = len(self.sp_model)
+        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
+            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+
+        return out_vocab_file, special_tokens_file
diff --git a/requirements.txt b/requirements.txt
index caf6471e86..165fa74af9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ boto3
 # Used for downloading models over HTTP
 requests
 # For OpenAI GPT
-regex
\ No newline at end of file
+regex
+# For XLNet
+sentencepiece
\ No newline at end of file
diff --git a/samples/test_sentencepiece.model b/samples/test_sentencepiece.model
new file mode 100644
index 0000000000000000000000000000000000000000..376dda73010c6f93acfa3b974bea81a9ac9e1740
GIT binary patch
literal 253154
zcmZ6z4_wq$_Wyro_@^ZJC$FjN|INxa+-$SWQp-})O3RYUHroamU=$df0hKmvK>nIq
zs8m|CDWQn`5fMZM)HW*%DlJ>BY$dZ#x|!K#o0aYFdEb|r@%!`fcs=ep=iYnnz2}~L
z?w|KyWOQWM_{_BV6P56<zdf=?WXO1L7Z(y46&X7Ik3qIur0qI&%rj?iB-Chimo08D
zMZ&*$J|`pDsjm7g-pcT*pC3rw8yOiHLWX+r^NhWbmqms`PglRbo3)oF!gLL{>Tyqw
zKg*$>`$z8H$nKGLXM*}<cD`S)BP~HCJYVFm<W5w7KLAJb8LI3L%Ph*NuK0X~MJ1^H
ze_Uy`>`F^g>7mbBQim$KrNpnnm8Q1tU1Q1JD*n*({)QGgylUFFFZ%m4LY@3(tyRjU
zK6~LMi%L`9UiET7PlkGEW|<{VRd2<-YEdcb<xe+QRJ>Z*t=yv0Rc4-{vea#(U-Q>b
zb*gUJaE#<?HFde6V$`ggD)vT3M@CpCb!4cXcf(W0qjoRe=&u{+RQvxH;IRDrziqN?
zmy)G^&8jqJ^r)#jA<gt8x;^Ua`ptgTi(DDWYD{yLr8-HS+X-c6B{fU^{a<jVTADjU
zE%@6OtJr+?WgC<!=JKjv!mItp@+$Wl7*i+9ovC)ss<S%b@Thkmgfx|~@XCE~3@1~)
zF{s|6lGPuQp_q7P=B69B`PIZ_W~ekTEID*Oen*2vdYtOQ%I#LoM0L+Up~xRSLX}T#
z^f%*9Q#+;wIBYm%NK+usqt-g$j6bHU*IwG;Z!p=BpxzqOY^h$P4n1P1IQ3~Wlz&DT
z_old=dn0>Bc4LMXxieD|)P~qyejT%!i|zY%`}N1Gbw^wLO?%yHUuS@09RE5EXF8Io
zdWE$5l_b+hztOP9YH`fSjcxueBs&&5)vFosjGM7VhwbqfOmi#OU2tT0T<Vk4hD>p)
zw|_OHBSpRP`@Q~V+-XjAU;?aGl%_i0hh%0Jsyja0=kHgXI{rNjCA=BxZ?&&mWJ-z}
zGxs0<Ruh~mw;0Y$ezGf76<qy>pLI}nx&sys@yV)m$bP@(Me68b7&F8Kw;FIRz_}9D
zz1`om>NwNX$IBp%xgSh+c{0?D&k&f_9w1ia91E~+TxMR<TUJf{`_58GGjWL?cd9!4
z4|qm6U3q^C<Pf*ARP3y`{r#GxE^mS{4U)+`%^h^WFL1=^HXa#<z$o@QTnTE?QFx-A
zGc7}XHsc+CUW#kJQ+;yJyH<ZPJ?hoZ4T*n#@cVW%KCk-MLy+|9R`uc2P{tsOoa#iK
zDdLJ(l?{gUI@OTR4C!8^Uj81^^kR|j=H-(+{B~KC>{fpbf6uQiGhI!0!}*=qnW|o#
z^Zwq*%OiW>#P_>B@tNr^yy1&@gr);YZWUPzXYA}ief*oSOe5}h`Ul8fk=?26jZb#F
zQ`C$7Kd>_K+{;FFktwRK#E{pGP<M4d=vN)@R?B+8b*WCvQr%i$QB4;LhaEDC64cb|
zAiKtwj3W%c-(<x*)wF*bGBZt$>hqyrRgyD9{r*u{Q_16I&cn;#QSHKSju_J8R=1pk
z>{`OfE*x{nRUi2sz@zdez?j}8FuTwF6P5`Yq5kwW6w1<_YW@X7-Znx_wjcI4>2(v;
z$A!XEez@9uBa~?-*`3Mk{p~gcX2qwdUsl6lZAXR+k2qfnXBzc7o$AM5U}*|(J9pE^
zmcHB6$_-FvaB)s7lDaLxFG?mBJpB{Au9T{F_x`8f-qW1w$-!`@Bk@+qx_dGLZ5EF@
z@HnLD(+sumc^LXWMP;@@8DE&5sM7B`;^znzzb3<(et0vn-q63no4&`BeMiVqKkFqP
zezq9aH1MFh>Ma;kHPww*UG@Py9dV|p?;ridpOxXlj)!7C_1DW#cRvba>Me2+RPqww
z&4}E3m8$R-1YJIfu?frfT>hEA8>x;gm318~EwKqX9ERkG@Locokk)eJN3g79M}||C
z{Tmuv%vKi<9rHJKXNr4Kn#%hG0lFyl$mO5=E#pmAFI^4CSRHYM-OcyHP-;P@dhZ2D
zV<9(Pi+VA@E~0DCd|(QuIMlvVhRpP;us{9FKjb(^{CxG#IRQS^L1Za<Jiy0mzuW4B
zH%t1i%nWB*oYRw}4z(a;Kp70~!7l?9GLo66r-mQ*cb(=^{|sj=>Tx9{XQ);A2&jtZ
zs(VVIvBe~H^6yY)Hl50S8ph0uJB_o)j6Zf-#b&CN$xyu`BlR)H>4?|1jduq{%#dzk
zT{<Ex#AwFnK$_;0-1_jgxCDVQSvpJBRR&8B9#Fs4LG_8eOh=!L%mjk2SJk&5GaZ?q
zMr>@<ylMV^k9(doK11F279!IC4uj+0@lK#lhPLN|)9_s@kV)A+_6t9ovB<6dyc*V2
zcX;Qs?>zrJ0^@Lr&J^52-S;YzE+2J!<}0-x9s{}vnfuG3&2n<2s?-03Gs@B|TYqum
z34b;*E$mJ>#(IltPlGZl?{Q_h)q9%n>P4cS)AC9L*8HZaw@yK0P0nnm`+w=zH$6ky
z$G{ma$xco==f=aEIZRNa(*jJA`XUbo6`3jO_0^E3$Iiv-k$2&Yaqb`{Wwo11sd~#(
zoiL_Tsn%%D)}8dX=Uhmfnz=Q=Cprk<*_x-7;Um<6eUPRrvsM4TU->;a-Q!A&ccpV;
zsosFlXwOLY5E{2>f$2&D`_1P~ux4-xdTr(m{XajS>fms)LGxxDac<Q*84i107ZaoQ
zYuYG^Cjt+=E5N!F6V*TOhSf_sjUFGl&s5X5k^f~u(|bZ&=TnfTZBD%&b+FQ8VdIZ~
z3}m?%GEp%@zP8jkwYnQ_fH5s66OVph1&hXXw;Iv`$<)tPzm7QN*X>H<u5{pfcr%Vv
zx7s!)kh{pi9Nc~rJoToj=Ea7(PksIrl;3i$wE60u%?KF0uCU{<DfXaR5q{dZX-1a%
z!&T5`K&d*ie>4W3j*hnesneP@*%>;HO`Qa<-6cVdm~2Q6Z+&M%T0WVe&Llv0O%^t`
z*T8C{rVuT%jv9uAe2@CiaTqf^Z@P;4Dv&i*T{G$5*4XY<FHVD^J$Jf#BNoyqPUM#H
z+`}e^Gw?qa7?R;kUj}KE&UCBp&2Xmov-KH!ao9IO9ydLZ2yX(Y-pDhObr_nNg3z?1
zy{pX~WSQ`|bXbP!BKEKSG?0}_be;7tScYot?vMS?Fgon1S8Zqf?Z!I@-2aM%H*4Sy
zRXQz@nd(H%xm<WAcClK%63R^HZ1;SE!O9mAn9(IWRQxtLY{TL1rT*Xg8+cfsoQD1>
zz$dxg+^5IF(`)UQUo}FT#wV*CtuUsEM3;vn|A7xoZiY5R>kon4^c200lNOxyx0B+g
zK&9qQNQ`r4=<xKYCr~1TAgT6f-dHo4h`DSjtaivGr}|4tpy(nUReq}u6ipx;Z+ab;
z;bo*M@7X}fd6{%_xcbiD4R0oAifvZ}_zXQ~+kO||(;ZwO`VI;34!!*jb-)`tCOOm-
zDFH6kp&GK_F#p49!qWlyeQC<M8qPmdF4s;K+k(V+*L)XY!l8K#o8t6(Rm({j2AW7<
zjX49Y7frhAbIm!wmK3*J?<#|0kaVpuN6LJ4a}vCn5kj{oL0vH)0ku6@>b)J1#+Z7~
z8+|IkdYSYw=V7(~>AiPM==atz?pAjyD0-4YV;QrcO=A!1^W#JF;LV^OT#)I&r~mpE
z64U%5kDlUrT3|L-{f3}#LsQ-kmZ2@Dx;FAZezmjp?drKJ;k6~S6RAH2GOfZR(%>=h
z^(rwN%BY@}85gGqo2*5qLmp>FrYB9!(-PBVuU<P9s{%ZWE=vvB9pD#fhx@b*-q^^?
zB6+wYknPri_|cC7*>iZdxcEO<Ix|K+aKjJ&&cr$69s2P8!k-bD&ZIgl?{3l(qkIxG
zUT_O6rq#!JXDl>hX8-f3Q6IwKVDVNgSpO-U83L!U1oiu$;EmoCvub+$jD&7XRo~mr
z`^zTjW5lkI0Pocc=;q4<e4>s)bNa&5EFpE8N*Nx=)ZV2&ofycxPd!o%(>0nbuQMf4
zUAq&Bnd=mXj#EQ-Auz+s!l5o}gJpQkshZRYtyiN%Z8`(lwR${i^o$?<TwJC~@xqy-
z8k1uJ>T@~B7~EvlcP*4LM7+LXUb6w72Ar(vtE&QetbVVmZZvuNT>a2FXw%U+r}n9W
zVgL0v`k-6AdLx_}nb(n}^T+%-kUu?5?U@K?sxuVY?Q>h80<SB&!hJIX6}U+!CBo6;
zJJqUrP^QN$t2Fg^W1!q5UJpNn)AP@@AniD`nTY$<%2R=AoYd1*^N^n`g`?DaL!peq
zd)x${_ch<O-jZF6^@E@wKE**KIT{qCS<clsoGQltV@9jjM+Uf5-hd{;nO@wfemMeT
z?2_oV4y<wiMPgb{aVBOcPmsrLd9A!aepi5TE>ek_?J}&>nMwfpSxZdY4?EQ_2QT>Z
z)14kKtKjOxNK6B%I#Qp}yipj(8OiorfKBo^ROvH6TTaE%EO$AiQJmmVeb>Pmt2mMz
zJQKdTAs|@f$l#pvQZoVu8s|>PQeU<jhDC6VdSX9}X)?p@R`1^Xi=U;B>YZ3vrgn~c
z@Hr^ss$CZrui9UV%(P3WBLt*uKw!q>(Ff`IAHy5vUgGBx&7$po_1Awvb@kt)o>0H~
zizRb7-`pS8sP$&(4JpZifYrp?+57V#yDGX=)k-*HaUF$-cSWm_m>zKJ$WZsMfj7Dm
z-I+v_?=~U833c=iKLm|cX6xWS@-PgYo1tF$CzJ`aR{Kokrvoqg9px{o_zy6s(5IOH
zPB65dy-pvr=}oFDAp!4v;Ta@G6K8YlaAS$OoWzmhz>C)ELZ-{f4&uz0nm1kcI9you
z%hd>s72}D~$2J7YXQtys@3secW^lgh`93_|(*fIk1ll<G6m|5IKtZpYiqHNA-!(fK
zT$M)uW|pVZt3F7CHhoycRn4oOoQJ@aO?SA|yNhAXD0tD(uizoek@!c!*7S0aPu1If
zrRI&sIA;=;8?ho#Azd%r4J+aOzOqPVJ_~D1hkEO%d8Q;#B9$ZUYsX<tAMS7@B;l<`
zR{zi6hcxHn4E0HnpYL?0t4__EK6o?qE6M4gU_qu1OMTZ+!sx`I^sB+2N&`CC%{}Uw
zjqs+M4-)8mSHWUn>$2&*N%m>Ra<qqTg)sw|;Ye_&r8!i_1Bguf3*9cZS4&`;^EesP
ze3hw-7&BV?d2NtS)BfH4LGq3N&vvVse+`sQ(c52a+$B7(h27%e&7zHSb70W1yFL|}
zf9g{7G3WApBxXoS%w)x(KqanyURMH^-S#hJeIug?8O{`53pnfW?rKTRcv5*S&r)yc
zVy5?5oW>0AAD=@<yaT#~sj|qe7tp&wftM?-Qw={J(2?v+Nmq}25vaim!jn2qA~4;}
zAS!<RRUn@|hvgFWpMV06lPT)RkMJz)zo=`ku=#7Ib1>V~AGT}vO;4x4*y7MoWTwXs
zOQuS%N5XQ_&Tw`Nw6Q)dUw%8BnPKz7;Z-mGH6Y?4F)f4T{mTMGej9jQNoki#lAN4M
z9O_qH##klAsXt)+FDOWN=s+wUN*K$!)7e=wf_$>>*}FkLQ7^YTFXaiDT<Z=j3{=&V
zsqT9c9xqMP2k&QIfbpL|y!xFyLH+4vQ{Z8JK$I#N6wT-K_YWh-3GPy#y#=GqLc}?D
zz+~XDPW6r5Ze{43L%#@UGp>iN?YgdeK#*uXg;)aAJV@#8eND|d1fV&Z?urIHTR-|c
zSTp=YC;Z-#@Qfit?{+cQ!7#yhsbTj*8CzQ+!>it$kAxXZv059q3C{F~FCJW?#%qC@
z!Z?q4D7{^ax|-5Y_jd((b^{(GgZv_W19>UP$2s__a$hC&P0!OQKCvpmdYul{<8xTk
z0~6G?{u=?Eze6>C2Zbs0ZEjq*5dT0@Tp1Z`CB5zmNYb6|^#51lB~~%%y>*!`<xWqk
zR}I$!GZt?Kj{-CAg*X12oT+Y{31@ohr1qBC0roy($Jb4;bZo5JeIBZ7O(Z5dcp9`M
zM$bIA;fxP?S&^2dtrEtti#&_~m$gJDhD_B9*}Utw1eGsh47j=_F{7Z<tf04a5i^Pu
z2j`d1HQ(hMuElD=kAdt2H?GriQ)pn2E;Z*CxUNC6%g5Xbi{s$KvFhr(V6eCjtRE!=
zYR$+}c}Z}l$$0&Z?xZWgTZe_RMet^L#KKH{>OQy$k+G6<ak_p{J)|Y3khV)2m-r90
z$nRY2CdHNTrgw{yv(&4aHQ8~LR}I1J`Fv@)r;2PkpNxssX3dzY9LQAbd6Q>dR9}t=
zvrJ$fPe0L|(aJvLRNlwnO}`u|`ka=fBrZ^d7sw11sd+O833SA*dIt+6cpQt=xFA2@
zna&X{SRo0O8R{u5Fy<i4r!lVuL5VZXIt!HqE9s+pmWqlGRATehyVXxxV*1YUK0^&l
z2y~3q$d3JqGa%3p02wM`J_2Kx2XWOao`S`VW7XyZQ2yXxwO{!y663}hE_K;2rsNGO
zYGk;-BwMELyCv!BhAWZ#oi1Vccz1f18haJIu@?t!D%=zl&`(xZg%%j2b1}7UCehah
zDsk-P3(tZW1g7iUbk(pM0{MDbrKwwQMPOQGJLhnFcYsgS+kee0c>KeK&#HuZFh-e|
zuY2$21lTzCE=L=zF-Ef9=+e~Db4bh(rnxh?xPGDqriddxUQe<m=^BG8)w&d0qQGM1
z?d+0TX}qPeFg^?F@o@6vY-mZ$IGkCm*n!^%nvQ4x^Qa6hFny16C8=ls6UgUQ&m7nO
z5a^nRn2Wt1KOc~!nmGRHPk~CfFb9ic65*c#>z0H!m4zW<bDQ#jdPS3ZZRvZ<lI}2m
zLle`N^oH*mrF8~b+b2*U)s3QKm%|%dd0nYGIv<*XgxI3(cIpvmGaSC8sIpX(<>0D2
zI}h5lr0*K4e=&Sl%+PQANnYhWfXL`}@}$II3$>(c0qestqMBf#L}QV*poGt1dM(-C
z3AE)%;nx;JHE%{mI7Z15Eif&4obgWALN)STB<9K2!Mf&B%cj|_J)kE|&(_3CdCm_z
zJ|y1^SI3gvj{?QnyHnI-Cj$H;{U($7B|NK;<xL-^Ue?WDIS#iiOMvZi!Zdw8QAP8S
zn8C+6Q(UYh^-DFPu48T{r$~LUxkSeMx6^v;x<r=lW>M&es7quEGI?3~|H*hTWg0kD
zTICp&Gc`Sv_4BK4va1)?#V+Ggk=N;Q>Cb`}b*V1DNzpeDTTMWaVTH-*TF|u=_zb9)
z1^HzC1!0rsO*^R^eDpMi)*`~Qk}{oY?`defHdUVy&kpbI4`&M<YHf_+7FlmNuiObo
zLF>zLx9KpZTh_He6{aCDR!(O#<7od)SwNELap~j187(o}x8*G+tep*(OTssB{P0%-
z<*cA?jyX|UYFc0~)Gwudw8V6R%L9unMhlE7;vHDr{c3>6R!&vAKA<8q&3ZL{p)w#z
z<q(W(-d`1{!|B_ly4S%o?06Ru(eWt^hMUI2;J<E<v@)#UtAsrQ!ya*yI$Z{3%snM7
zDMh>IH(J!S1k&9p?j+Zf9CEeL49A(s^LB=j7$0==eb|*ogH|bHGX1dbiqGW2Z=`16
zTsn#KRxu+;<ROxY)Y^?D<~aH7&wqlgIZ`-RdyT|Y@uunn%K2cC1Sc<H9`#=>F++)W
zdaQ$Tw^staCPZl)_tFwfa_6*3YG@bzFb9!aG;Nk}7ClVVtm#mk{*s#$<QF+urR|zG
zHcD_2I;>SR<nMt>TuGd2%lZJHu19$$$R}sw5dV2CkpG~*(#+ZfZwA2rqi<8{V(n!C
z5ogL&Vyh)mk<lFJcq8+&Kd_s5E-gts_UMRUmF%*abw;%WW>oXjc)ydnfOZy*6;?xn
zqR9@A^*|pU6zP+#UXfR5k?{ul>A|kUw8XS!#Z7g!78v)ip3ihOOV-sPw}<1;-*h3<
zp#=RH&>rM7k_ji{w+6HlVF-MmYe829r+2>kxDj55QtOpAstL|?ix0N?y&<L<N!Lb1
zdz1GC*c7){HM|eYVwtT!{v~DgFsiH{A`CbL#|pVwt@sei%*sr6oYOs@AEpLHj)Vlu
z)|ZIpInrDl<4viq(Js<=IIE259vA-<zLBodij0IQOJo`^k$Cyy!u4{sE@OslU8dIs
z8}~S^2(DhzBGZ!g@(lG_P>{}VC-}Il1^%WJoa|XweiV@KYYKJ;RUVYMcm+Yt#-PMu
zo+4%(4s<3-Um|Y&4Bl_wOm+5O0hXOly*nt%-}OWt(1wqIH@#pN*Yk69Vn9MK;v8}6
zciWH{jqG)*<8@O&A3DviP|BmD{qr%&6Q8V)ftJYl+Pyq#dGsTrB{OX~QmiA3CGoG>
zM2xND=Ox<G99-uKVwaR7h~dqS^yEvMz*b(Dz5rPTyP{oY8llsQ$d<Ir@wo$e^{iSl
z)0qtYEwih8X3FCDbUI(XriWz&&O7uQgC#I^(%td%@gGZKJmF#ebKQzyl|;uvJkApM
z?du^-UMlgXJDu#e8!xSt$%FAGE$HeS+aA5G(GnIFF^IG4)?kHr7L8Li1_gv&?sz+b
z0*Ce4w^<8}%EV0Tjigme%pzj=)}zbo!9v>Ic&GWt|EtHT+O@!#5+|k>KKXvI8bMa?
zS4XtOm<8w0QeOwvB)iivOW&RpFs5~|^Cqd0T3~!EDKkq=(X4-kF%=hr<ueHG3F?tv
zfdYy8*&$W)roJ~bPAv&$v(D!$SMNY}3OC27KCna?4m0s;*ex*ZKg_7=?tv!8+@rqf
z*3)ku25H@;e~J%?xLv2{+t39qGLfI}jmf;9qS0%_#!BhQSzhZdWEC~`=JbZg@+0Y>
zLT_ezP}JqBZhn2|VW+c-VGO?4Gnh2>v#w_5QeTSLRxF8sR^pjBmXReh?V%@42m4=i
zMPv1J{j9Hk4GQ$nw6fI2V2}80Y`K6W1X_!8@&?W)AuTa#Qe3HQy>>0=8k)zwP&-aU
zuofp6{q%BKu#lHesQS@2I#>v|(5|Gk#Hi-YpO`TuDBvN69sJKhK`LG37sNq<H{MB!
zI4v-~!z8h;HQyCv+(Z;k*Iu<HSj+4dwZU4>6y_#8F`$LJ^L%w>5<FgVFROmMrcE#T
zj({7?)dJJxB0c&ME_gFT`eR~>OL^xZ(F>Shn4s2a4sYdCihhy#^J0^k$_F8rdVV7u
z${cYn-rcfP>mhiY)%vOO&AwiK&F(nsH*>K^0+KjKY8>Cs9?}vsU>fjn{~>~0B1~}U
zomdANOKAF;=CFLtl9;J<^Lt}`6R|{H&aC$`{lI#OO#ckW`emyn`+pAS`gbgr)R<<W
zivy0un<m*`GZ^2)!2$5x!mlFL94+zt6VK^*^Iw96JbW0%h6$Gxa>ZLAEm4bTE7iqz
zK+ltcMG`YT)?q45i_FxSUktp`Rfn@%iktnvDp<gq&d)v6j-WuFwVdkpU=8Nlp?pVC
z=^qgf;~rHKlrV#Ae(FUn(bJ!zqs&)+&cnW?zSmrr^XOyr<DVM7>l3GX^NK!Jmjk~S
z-IR{N%r7q&=-02c-!3qvJnlpy0wMOS7MN~LpYPCKYDtV)?`5Y?!^<xz6w46_vtCjN
z*X7x)OJp>0(a^7omlUFX4!d22jA02S{5J2>GMPz9dUana%1TYwFV&XF=+<Xxw(v_N
zIGc;#pncahG^S{wOTQyo0^^bR20J>(mUF@3@)=4;yYGXN2lX?ZSN#xFM0DY1dx^k{
zBtiWcEJ4iYo#ekkL84QC#lD~g#)uqIt+(Nywa84Mb#AbDV+sy~_`|QkI{JLb2Bv<~
zA~SApx(mNDlCGd)c}a*T&=R*r>$NUii@Lm*pUxANw7@hyMgNNP`d|TXMi$n5SPRU)
znBiEg@-E4@{!rtEpdjA*O8<%$7>((fY~|}UZ#wEqTd4iN!ixl7X6AXO`tw3K)1^#r
zCg+>=StdJ`rv#T8`edNGxs|C?H}&-^yoYa+Sq^pCI3z}eGi{zb%LuyGzth9jHuX{w
zk6SK%xwNX8WiZE0;GEA7#Z(Wi%XHU~$R3wqBql6zHFI+d*KTf=GfnUVRcoKHM5Z93
z43CSyZgQG}{@ugFZl}KnT+&LKGd-EGc8P=wx;B#~=?a^AQ`Ams$xN#r*L+S&T^%;9
z>hl8+nhWt%tDtEW|4uO9Wi6pOY-p$VrkJl6mY5m4!>RXcuZjwqliyhrZY+UmHLdIQ
z&JuOGA14od^b(1~Eb^W~J@_noEP>IJsCOzNY#&|5c#AWMcf!kq=C{u7{KZR9f=%_E
zt8Y-kX;!~%_X|q&Sqr@bv;-gHr;#j{VVa}iH0=sz(AQ~^F_1oo^3Fo7yR^_al76@3
z&q=hvbT7%}Nl|k&&uU;AwG-rYv0S!v2pfM3xiref2*;n%#qp5AZ$9qVoKfw?{rDxn
z78ncaILX=@IzCW@s}UEaUv5KS?8SKzrCt>|uHU}MD<XRl%Hm!8Nl&VN!Qu#qGm~L@
zIxjOJ$?(Q|*v(i;&O{?K`uLL$>)>ZqGsU>}n%Bf0T9t7V{q5WxaFc-8I=z~c5T1tE
zXh`8OlBs{@X~|4OG;aOy-V&MNre&t`XA7o~k!X*%ygQ&O-A%;RAHptCj|CQ5FOrt5
z%a+ORd9F)U@lxmEOkfr9uL`&IxXDIfNlm|ai?A+6mdN<QLU#)9GdkX0Qf?7JTD!L;
z>gugSKmOtH^w#tuF*A*yB}Qw3DZ<4c^_NQYK?=+C43@Dz&8MhdK{d=dR_Glp63<PA
z$lfPdB!k!HE`gcsbbZkr5G;`BN@Vy$v;aR%aPn0oLH(zT<sCtPy6Arkx%jE|>ku_x
zb6wWd&;F`c$rY9$o^z7*dxGn8`}_THrX$0f$s>U!GPBO#UU13Bx|bAtAd>?WKa8+M
z#w4?e)cn@=5{V;@qhNxOaAmqi_5aJ&*5&D1b<OraTGW%Oq>Eahe(s{?srbe}?(5S(
zvKL=p9e<2-v#UDNMrSO}=wnMA_I1R-b9<sAyQQUO{536cete%3d!l8#T_jKc>9hE3
zp#4|7G>3^4j)49WdV<K`H2sZ{KcngKA{{LvFLf8+D_}iXI+)4>L~_7{!QvYWW`HTc
zHwHO)jdI1L6CMeX@zX-&_DJzfOb!uSK3%ZcBxHq%y;vlyR3x-SB%(|te62|L%_7~_
zi(FPC5?LV<wNs==t4L1}4c&XcNUwb&mv@NtIUu4Ai}XDxa>X%`en&+Hoa7I?J4JqX
zMr7bAk-_Ih2AvZbdQoJ^1rf*1-Ni{i<8P0axbM1&^O~NL;ER@|BRwVYyPo1o?;*+8
z_mKJD^^kc-dPwTDC`lO<CGK8*BrPRMrZSH79+!)^@^bNfbGc+*(@Qe$?Invo=q(HT
z_L8g*`pM!SddXbcc=CsSl1Z8hp3b&OK@agw^4aC{%fxqQe~~-(icFa%GWiLSyPgY|
zsd+Y;eoeScOR>p4H%G|b-$lrb^a#0ka)jLXU8u}#50y>W_5SO^<iS@X<$+#d5-X81
z`~FCI_=9fpP_N5m&a`gwNOL#I>n-x=_-^83{5fSJM`$yKek^s_C3lKla>m-3IJ*?4
zhDbqdh~$i8UYvHhf<7*@g-Y?^5GgtnBF}agsif{G<jbM|NLoB5L`tCBkBG#gGaIxH
z6yLqr#^*#=6gnq}EFByo?Y->M!MJiqQ9dI?%5%bHKV^OO^Z|50=a5&yoF0a5|2jlE
zp0LXU=&EGi9wB`kB!FYOtP4G#g-UL3o1BE7hx`obMG*EohD+LjjoM!gl@7|ace6<o
zd<A_hpiZa!L5@FlwVWuLX8)qc&Gs_ng(Ws=B&`9yqc&_q8*kX<^yCD4DY7W^o=r}$
z=k$({+;cYh?t^FS#j)7+q)ilgg`GAg89s_JXNfGNPdTK%Qk%5jY?F?@Hd(&UCQl)6
zp)cjiCjVG-g}nZ9U)i5`g}ib974p_MSIC>bKC*)LiYWIke)P`qe$?$JW3l=BEyLu!
zmj}qf_ABLsBO~R*udkH)SvI+qyiLa<q?LLfw_GVl-@aOod^AEn{po6n(S18ZpV8O;
zQ>YAvA4$2w1L3ljew0veC+SG!>695unos_6J=Yh)<uX#=;c&^>7jD_3BQ#8MXEDai
z2#ke%XSfuMjgZ1o5t27aWF<N_vF;|Ir|49K6n8|(8f4Gzi;(5CwGR3;zO-pB?c<ZU
zux}$MpA;@rs1pmaX>$hYTu`$hT%4qP=wtg&_{DW0@;Y=mYkWWH@);uAsJ{t6X(p|o
z5H5x2Ehxoj2h*owkv8PHo0*%#A~m)Mc?Z4(-^^WWlef{)(IX5W36mUTh50rqYPCrw
zZSKbowe)p4zA=eD-32<3`Of1Xl*`Sr$<i_#ZP?@ubQWy3N#R_PBK)A3ex;&A`~Grd
zxltk?Q0Cd`B3n<1WRv$Dy30r5HzUy(Ym<Cro5tcVui-D$`<U`8kZE8147#?vO}-{=
z|C;=3>{3czIpfX?6DfoKg1j2i%0||bu1C9#r0u^_ey>ehq4#Tfeu(sDEIB7b<Xh?#
zo(hq7wJj^KFZn+stAanQ%dTZD9}AJ?t>j%v--ZFbp08)^5=#s51z#+-og?xC^_DWe
z<@mu#>K+5{p(iJsG7DHw!_cvn^`9Lg2a%We4v{n1GbhF-^%q4p5mRa=g~&-{<;0|m
zr02mI^lXCnVTYW-BK6QkF(NNAhE1i&(CwrBrKIKO;ptONj>viH)prk(V(gGNf%aL4
zdCZsIvwYoktP{CNnf4DNq@6e$HVD5crrf>Z(h-Wkr;}!o-X0+@#q^Z+$_VMGj*tT8
zvM{zM{@zo%Gl#jh9<mhP$K2%*J6@w-&oal$&xovoKE@cHp<hL*J*AksJ*l%@*BdD^
zL(c>DdXQ8NA<lr2U@(XQVZb+vF@ayH6Gb2Mve@%Tb7tT>>uu5>dHF^9cQf<CK08?F
zVULcd?D$fYTuJ^M^j99}DTU)i{)z32*Y}hV<R!gDULo%jWNYF73hmq3lXcxw4%6?Y
zD|*V0$VcjS@v|7xA@E~KuO-bXL+5bp(Wvc1`(M-M_8!FD9<s56Io;0~Df?_rPg!2v
zQ}nvMo_Z6IjVG-?Kwb34!cPH(%+p-b1@z-4^2@{M%LV47GDMunry*NDoq0aT`hs6X
zxjT?)Kgfp91nt%M+1nz=7*nB3<Tz<gD{BzDozSwKHknDCk_zKbWz?AsUqnADpkFFs
z--Q2)vU=?wf_@Y11?^zz6m+2@2ly&%GJ!cNSRnEQd3p5h86~oeaTH_gBFa8X{}ajE
z1(wssCSvx+={9lF_pS7)d=7iVW_%LlXfPHZs99i>`IOn9^{&8<v@->iWAC}7vA_v(
zQYjDW!|*-Y9E&WUaimbMlJ-iWSAd%C*l{cUW-a@`(hk-e@<z}K+FOVd?9;`>gZ4q%
zmpLEoXHHo+?c+f*^klm%ok8rSeBn{%E{3)lV-9|lOIse=JWO5(IHq;tlSSC0sMGXO
zpE26=&~v{{%BO@#-Z=Jf%AL_=KV#me*(H_!pThQw(UlKfg$?u=&r@azvhst>HS$c*
zUKA=hndk;NI<FD^5-J_D?Q)Uwg~;+)Co4$9@aHJd{z0^~U&nYPOgbJ5lW%D2ENFi*
znthJ??fCL|%5ZMzc$xI0p3<?{$U1JO%t_`QECspXBkJan>NCJ3^cPpKhaAOz2O?w~
z{46kCw|k>a&QsQRjJ}={xl_l>i}bw%{(`n)gt#c@0L6^4{TiEOK`#KE^k3Kil+-tu
zvQGTQMjsr|_?~aN$Qa5E#8z7<qt7PSLEix8ASZ76_Hiz{k2Z4$>oW@b_kajuA@K}<
zDjY>TEQyfs>EALJzH~HPib#vkN62*QJj>p_{3!c5`hKF!DD?e8TCyHJlh6b7IzB)f
zx#&4adK8qGaz4n95Pc?Se~x)S9wzmt=sR|fMekpL10;dp&{yG%kWT8gH-|`NY=oSG
zP9^Ugsf}3O9bC{fvho`44W!FQhD(G_XNYX3U%9E+gmXZyjq?e1T92%dbf->xb3UkH
zzkzPQpZ;*h%VFMp2e2inj{m-k%-vy|tfkCy_KV`NB3|@cK8EcJVz3voyln0w^d~o;
zy3jtow-7G|AhY6sr$`p{o(7SiC<=Qr&ztc15_CU<>}Kp)KGG%=Nw>}+AGuyf`$%h)
zO`b(Q27Lv{@+m(YdKB2i-1wqs4<9HTgFUA+X9LXMJdXTm>XlHwW)^n;2tWSGCY6&!
zCXwe~&pBs8<ORx<f#vk=41QA%owJ|y&syIMoeMpkI(Y{|EFUSLtvT@8#y;AZ1zqUi
zPDSbhihk=@-BE4M2YR25LQgMnIarR}8cBD9`ooNqG>du%s8_Fj{Cb;AM(^F=pw<;?
zleZ|dfHIpZL_UNrU%{CA5}S51&tOwN_eIL*I^ij&+xU;x_01sZ_?7)1S#6ETQR<dJ
zx9??N|A6ynEB1OWLjHw3hxu>sKUg}3*+t)*{*A1onZ3U{T-H<Pq%JcG8$)jf<!2~=
zJLNeG?S$V4a=PRHq?Y^xyHpYb`qD4mhK{H0Pta~cxST`RAaIbfC8KO|QS0s)Do5e}
z3w?^TQ|ozy^7J(aoBNJ(e<g1O(0k9)Dfld@-h&F}+T<Lz$|F5R+QFT&{X1f=ZIBda
z*~CVid^VOn#$l5S<mVh4A~|ZPtfh^dxkF`n&QK|wG?ei(Z^`TpGloh|#ZVbZTT3^S
zPub-MhsZVXg^femo5)`au+!33_<ZQ%p>hs=`keI=V=gBD0=(W+!<h@Z;`2o@KNq>*
zLzf>ND&<*2q@20yL4M8Hp|Z&_L^jdhdg|v<b`xni@;}gS4f^vIAR9hZ>e+DXX|t64
zPR7%R`aAS|Z)d)@4wCzk*E61nNMA-ifbsVRqrh<B+sXJr?r`kLnJo{yj6q&d%v!=H
zide(Nr)+W*-OpUaZYPQJ^rt&zo;^k#Y+gbP$)T)IV?8vuKwJL<+6KKDV{xfXUdkb_
zgNi`EwRC*Px;V)j1^GE=Shp2AF0wu!;~qYkHw)@lUJw~h{?*WrF@`uW3SOVT#*mh?
zZibULPV1OO{U50RflXqN>GRVp<bQ&mM0%T+QEv|PbWnJX^ZZeu&-Ep&TVxsNUw(l#
z&-uP^Cvxa<)N4=Roq%<}oU$%m_aJWpCFCCqmy+|8mr&{WhPdElJlN_U>MWq#ljxhG
z>-Wc&<Ct?~S)lzfbd3}l&3x+pSD*RI$A!ok_#44`%50j0Ufu|bSWD%zLS!?tx#(J&
z8YP>`L^jqSC#{d79Q)&8@-{Uh2M2VU#E+cA5%LdBlm9;HB+MUAT2n0IBK37phIwqC
ztoQdJl6RiFkuLuT{JqR4Ydhy6^Nh}9Z5zs_k+ydmBFoPa6F3iba2}e4-eT53PFT21
zhsJ$<bASshWgdLP$z%LJZ3F0W$cwcd=vNN=tD81+E}{>+uSH%0>Va=K`S@iy^R^Ry
zA2<NoIXrgUZkHW{LL`&Ah1{)=B0mTQWB-mK?MvJ{j)ht4s-L#=al3SqUxZC^X3!Tf
zi@K+fJxp3ME<*lHxuX3MQUSk=Gs`*hUZVenCnID5ea>P3_JLa{mwSP?1l|Mm8Q8}g
zg7$@i&T#n~b@TAWV#+UL&wpku=Z7r#vT&)vcF)e?{E!+U7tr1ABhFlDm#v(4CZIEi
zI>YII7HJ-Od@HaQbKH(^b$lHvHu9DsFP|PE&yjYAFK>;I-lV^y8~;c5E+c)dB0~Nh
zO4%@>1=f)R`$Ta@hJKr{_B+lOH7*LrVBZO>>AB1?XK}s$o2gd~3ZaWw+r^DGS=P$_
zJ4#6l`UdOyU(EVF!aD)}pwC=i(wE$e)E^-7d-4nMn_;@#jjW67L*@U-|GP`cRmhgJ
zzjW~a($4y?pKg=kbVI*K=y#x3*DCSyOg9x--g(A-4fU@Jm46_sBYszA6Q3AImCGg%
zFox}%7aB+5AKh(YNB8M)@qG)nh1sNGf=yb`(Rz`0kE0?tqodholbv7}7*88_f`T3*
z4P_y+tvEy)D5LKK+B7Mk4{dyjzW)Pb$t%uht$i6Gd)9|Y814O2j|Zc6eAxrv?IFdS
zD~e*d_uv!DKz<|VdQe!(8(<Faz4Sd4M1U8s<K33|*vS2?V>!NG87}!Q-foeN0g%4-
z??NPkbSyYXTlJ+PBdOa!x($qlz8TyOZUv)3CS^C$-;ty*(*ME3c{7R)kvqv70Ih&y
z7`8|eS>ue7RbrQyndh<!o2)A)PJ<WMvJOexhq3R5(zn}5A0x$R9lXW1w?s(3Zpg-+
zG>Ug(^zEWfewfGt=sBP*AKw_oTfoSk;)Kr~-c#~NiEO8xd}2i^vIg$K+wyx#{Q;Y7
z<?UbxcjG2}rx9!iIZk2_Sn9wp^}U*lavpx*>!jUdJtY$zb3yyt_z8}iPfW~L><Nr<
z9BBV~fOKphAg|4}$yCO_VTPUia!+|R)+Xzz|1@)&hK+6oXIOJ7q~}Q&k)9*1Cx3Mm
z_fO~xAn$Mw$v+h>YnYtEW6{!xY%SjrT1oeVdN6@@car`+!zTMkOZwCB3hqid(Nd5f
zEv3uxiQ;H^VKRFPcG%o3OfKvE3~`(C(<y$EG=EM{Sz64Q{l+NCnG#J#w46d-aU=F~
zV2khi%3zKhJKx~Vjz!(Uozc2a?ZZcq?|ClFvUB}6IP!V)BVUiL8u7Il@^>(&KE{y4
zc-pUFJu?np2mFO-Yo3-ef4)rSpq}&`>0z5}ras3IpPuJ>XdiQ~_x{{$k=5j{>7;Mi
zYb|*kPkj1qNxw0Tf?kb}t%>1XfjnJiOqZS-_;H%XH*zVPKUQdudYzn)$twesu&Hj-
zLAsPaS^Lr`{PR0v$yS@(i_PYeHwTn!8huXSBX;M|Rw{I+=4n4?Fzw(Y4)_lCUKi;C
zkOhi3<5!@=x}UXh{^ee9ANmS72b7WTV-4qA#LikT{mhw8Us>}VpJ4;i3d*bp9Nl~!
z?7wS>D=W~~K|kAjV5^<frH!S;kJS~>HRw2ujxy5C>=BfGfcErU`3`_a<T=;``BGOX
z?{LiXcKV=oj_2)!va4;O#1Gz@eqjC>tM1nt_SA#);V|HM$$pKU2GKtLrr*#uQ(xQK
z$N5g*(>jvzp*85})t&d$e$xI(H1~Y`qHh#&J`(@n9gB-m`|&FP{LPn1#}C~s|B9s#
zzFF+4pmP}gdJg|#u1-O>A3@JIoXNFZkke)d-<3LeQ|OqD{$$!t;e1KHgSED1EwN?q
ztuXZKDC4HD9SM|MOu46!)!@q@-^m=7LC-=4)^rk=IP2@!uzElDc<P@-XVGxuk7soI
z{h5O!=-V#%J^kg(uom0ocSlBgo-!9nkCBFfD4;-hU;{PStT*WZ(B8~?;q0L|RQ=|n
z{eLiWtt%ftuu*oiwhMcI4jY^vx4sv5N1i{2w^qtt02sv@=L)^vnVY;*yiq|HoaBD0
z^O;NSzhydqu*h^|6`IDrvvhte`KhM;)l-;v%IdW>j=U1eS?dqIYrE=CLB77L{tR9J
z6!trZ@1k$Bmh1k5Sn?+5{8aL>srHqn6*h^|{b#(o{oc@FU^wO4-^L!F*`=B>bG-Li
z=e%)z!-D3D<vR$DYB^)j<LQJRsrB!NKaBnNve$DC@<Hby%ct(r1H>`(*J(Ryx$Yb7
z#vr>${Ty8%nfB>0=56&_#=nz0ww_ZbWhbznD6@JlZ^T+gxsG)FJR?lj*v3l<d0C+2
z8hnIw1<><UN}3JE(#|LiEgxz6)czCWe1kQ}+S^9Ia@f~>E8yS2My%)jS(KZu&n@`t
zb>L=vj<P=2M9Bfu(KiKj+&o^|XOEYhgX3j&*+gldMp<n`V*3p8*X*1qkLYo)ryj@!
zMJvWjEO~m5v*Orr+B$rjlrlEo8SHS5_m~R4p~B~FW~^7DlYY#AFS;;M^3UBSvCz{&
z{{GuImyzEK{Gj&%<{4k;Xknk{t-t-a-tXyqGAZ9;^<BMVF>>DgJ9ztV|C;n|&c8Em
zlcg7#H@=<boU+R-bk704unCf%&AovB#$Z4F&2JQ2SPpx-k3B-$FSmmf8|CUf20!!7
z0qphG{G-F_KlaOIT>4GW${z*20Q<6*bv)4ZS5Q{RbnRQk&?R62?ONYxzMQ}~h&NwT
zrw97kqxAbmF21t#<ZaTOvc16oFort6r;nMW6Esi1xjOr<L*`?gIg@UaY<L&>IiyZf
z-$>-6Zj;p|*dbfwD$1^5{kI=sE_wG!rOfJB`lkJq{xveE*~oO68gy83f4%9u_TxO}
zuMxSP$5zrJ#-4kSx<@I~0r0~dY@pZce)8+VPOuNGuE+n;aRAy{XR51w4CN>A{-*n@
z{RDk_|67^^{lNqoJB;<qTaVtu@;TR+Q0|Y6V=ZYh?;yvtK6L1P#TrK{-?Vg_)GMLQ
z=kT@F$8MEXwooah-)l;5l{4f&&$x6fS<P4vQ1(1By?zfuuQ|_sj?{X$I!1p^5);Tj
zrOTWW(+=(F^7|=wPRp@h2lRz5J%^#|_l3z;_S07AZKRDy!(@90do_L6F=Quu^?K-1
zup8TVfGZfA4IN>mAHug6+2u0mj^h*Yb$p(6la72Pn1nnV_!wu$N92DwQI^8Dw?H>f
zB(Sj;b4JLeodTE0MDp_4^R<8TJkI$OyN<t2iWr~Xe>nE}im=D6q>F&|-%?T?-(Dgu
z=wuBu{=0M@{aCNlUfjc&v!YS>74c_=mXYdyZi3E_;ZE`@{Yjxehv^UF&$V&J;LOsV
zKwsMJGPz4%dk^~4jkVevT`!Zrm3rHWKW~ukCH0wc)lgo4n`nQLIB`GnVaRnGJ#^dv
zi$N&knj^w5Y3L7jd&4GoL+=Og>iT`D-;Fi;HvQN0j_jXWehu%0r1}lGfN}W-b8qZm
zeE3BBc+Om7(Z$)gfb$(cMe!986ADN-O%9Vd(r3<47U+GXXe2tE)Jtbv8MJpjDfiOH
zD7PHwedw>G?Yy-=sZ-*ijztC3)AzJ)$O`hVmp7PeAMXMM7bE0FU5>q|h&A&H^n9S{
z*R*^Z`dK6QLsx+p_J+yx>?x)FxfnD5s~_P`Oq)gY>A9cAO9A$Ir7}$Nrtp5E>wjdI
zb=xUBmv6S!VY2C)P$|-LaRj~WpXDu~vQ?LR!Y;3qrh&IXO}9{ak95lqp?t5QZ($-I
z!SDDwM4ASLO5-)ixyLn-K9fcJ*?e2i2$ceSt?f1L8=1%ki+n<zUH3BY+!dOC#YZa9
zg&kXv>u*zq`V4rpU7Yakp>}zOK0QnR0{CYK&`;t-2J{N<?nUT)k?+`Ruj4HY8D|e)
z`@MGTf&MQ0t<rkB`rDvsEk|Dybm4h^m+%qqTI{D>LwrXW^C7N_hry>{%~<R?hPUcu
zyYO=)-%)h0VcefXvz}!Q>*4-hyxH~R9)E@S4v}gde}-;65+d8VJ8K<BAF<1$$aLMK
zv(Z8OW8jZx@UHPTV^`RQ^tg|AN8~xLGS}bO<+ZnYw|<O#%CFVs^X&TlL&~6E`^hfH
zf8gDwJw!@B;2R{ehT|c!kvf~`!>S`93CP;^iOeT`fz(5q1$;oSpZ1p9q>TgnGw>~o
zdq^wvt{-~H?%6%0nRFHMCa|NqhwLJ4<cBERUgeGA)gH1QzW(+Qsm1T>SmVuMe7~H=
zcOv9<pmH*H#Ri{1Zz6pS8&(wID`P`s$Al1BML!!!*UsgxKZ83zYiym1HvrO?!Qb<3
z@(Opm!k6vRRwMF!g~$u7%>RwNyDk5-v^U?({&utcy_DY*;a6ppefd0ZF7-C4UCcWh
z?Y4on6NulWFX21ux>FzeLvVbWUHJW+R6k~y8tnKpvLC^3;A>F$M2MUvtz*90SRXH7
zcYA;8Rof+;wBRS+RK5w3<J;{L4PS@8+UJ;)@swx2B9Xm7o5zn3NAg1Cd35ZN9#WPn
z^0I^9%8_4>KQ-LXZ$~KKPwONe9>*>X_`Tj=M<Da(Uj;o73<0{$g5|W&8q#f*vhJT}
z&0S00_OG!kI(2_<&}Bce%l04WlkUr_b{X9zTTA^N<ncZyHzB(PEW;0X+{k*vo)h8k
z01f0%?MnYb`Z6{v12dp!1N{v>jx-fyfWLviPiH?REe46;Iq(8F3f>2&!Ex{kSjt%_
z=YmbDp&P(1un)WqDu8dh@te+W*q45tL&k5mWCe459=cQTb$sB&nqH#sA1@ShhUiNi
zSxk(TP?>rQ-><+-@BnxSlwqs0%ai5h^)`7F{&BEjE#n|f1oJ=|*i_7#AzcVIPq)dF
zq}7{6mXPLx0?;s$HB7n^tO7f@?=_9%{b-C$o`+utc3$8d(3|(80r)ih2CxxS0pC{k
zo+8Euy_9__Z#J>=u~69tpU3xb-!b;n80<+LYQwj7A#VePtfhjNL#2Ve_jUL;wVd&`
z4GNPc&IFA|L*yM~yV-+xoePn5k9=s$JI=cyXOuSbTG87A+N8f!uy!{_g~)r9&l}WV
z4v~Hg%Bfe)K3)TADN{E9{<YuPYd#xjucF;8BdLGy0BM^xP(Go28!y1yj}Nprel*zL
zbo~(fS?26><ejhb-g)yt`w3#!F5z66Fv#Ba!yx+}>18ho{m}N@mp|H$PqxXbZWHZa
zQ2r}$8hi`dst3vUq(6e6!EeCM^*kIzf}E|X7?Jn6K@Ujokwua}Y@Osi*CvJC-e-+`
zCB-R!wiO+}&h||ICAMWpF1M{XUSV7Q-J`Z=fBm;@<s;A9wjRDh>TSugZN+@vVCT#B
z>GP$r)Gg8IKAvuqKBWD?KrjUOSV#Ij$y%TA*MjvGcG<vtc=@CVd5z!BRq$i2jr`zh
z6Ce32ZG&Vp2g|DILuHGtmsFp>Txx1Ams(q2sk?B6RE`Ok&7f*VxNMmcF4d$p+}Uea
zgiBpM@A*E~dH)a@O*=P$o4_q#BDe!I^NznWGeUM{N62o{7Wh`)725bVum^129M1Q-
zaM_j>Aq`+V56g|9X>Nq<h$Y3I4g2`5E4r=%O@qVbZqk{c5&14xxa_8155V)k?)lm_
zqZhP*R#1-aN0C1c;y@zUI0qXbp9j6g#@J%QB@NmG>dr(+EvNz2pprH=$JnHbb+lyy
zae%Z2zLvC(b-j=}Pl6>N7Zm8Ul{o-D&|JoQdJX<nVUyj&)|O2CE1R`Kx(9wMaiabr
zcjbNT4PbjCYqOO#O1gu3%h0zHtO5<hgXc*bNt?hr=mLDJkaOiO`nG#4@A<6ZR?wDh
zlRcTd`=4P=Q*Hy;2&#07tOC?RZ^>e83uqs@3Dk{4PHd=&WvpNqvNrHKcoVz>I;(B+
z9_fiE!sQU@$KVric6PXYPWlCC+dfdfB0UYh1qI22q>z~YJ@k*@XYd>Fzk_IhwU1}b
zfN0PM^aGWH*{3IkNEO%uszJ?k*82k1JIEV0SO!vV2p9pb0@s4k;0Dlqir98uWY<NJ
z-RH1>3HAqV#LhinYbWotU>j%v+d<<Q&R<8_Yru&f`pphpPYmxCAvdA(7BCUq0j7ex
z!A$S~IQwFRJVg4ari<|DV!J#J9S7Qc10|8P30WhU2ff>gjX@f;2khaD(FWhj+_ium
z6T+nlG=lBJ!=+&?YX#Kfw_Cf1%N~5WZC1FnlD4omcc-!@vchHOf^cajweHU~#DfZv
zC(*wI<bndQ4AfDsmNisEy=t%}hxM=)pDziQO44x3ECee-`ToJOinM}sBX}OVKv@5r
zuc}53ku9Kl+z_b&wa|4Vhe!kTI?BBQHh_(wiN~o%PzAmF=n&b(<5w;8Hn7JvRN5Rv
zrIoa0-w>%hH$*mrDzF7qk466k%9GZ`43Q@4>;i4zb?_#52fPOkfo3k^JGpG{BHhi!
zxP=RJYYA<DJu8OF)>wWYFn5S-BW<`iRJK#Dk@8KXJGj`FXAPB)(enxT9DD&bwhomF
z@D=oywZu+z6%?_j-%Ctj+_l5$qqap3m+0xl0+2fg|5`glPE)6`n10W)$+ytogCD`J
z7}oDd=A3ohO4<hYD8A>-VEpv0iau^3t%k2DV~ui-sLNqLW-b0q{ojCn0{ae#1ks=m
zXvUXzrWzmHO-yKkZ-v)Bwg>;-T0#7&Vf~Xfv=V=y8`0lXito)Oe(=N2e&`tphJX=Z
z<6zD_%>Px;TU>Uja@gfs=+U5V4rc{qHK2NdU2Z^j6SxIT1b2X`;BGJzT)O@rfPWM`
z4hlKD7d#d!dH04&9DE|k3k{PK!)!7Sy0elsLwa`2K=F_+1W$q`AQu#XWuRewh^!=C
z1)c}%KvOp75AX`~?m3)4W`)QG=#5|xaiR_AZ=x;4n#x)Hwuv!SF&;hE>PE)a${0!O
zYPgG6Mo1O)YQZ+3e>>hp>Lb<PE1Fr)JHf8*%r6MUXXrg(>tVj}9<$3f(gqjyCmuI~
zCg>dw{{EmDd^A)}+zfjBjyR96gV(`3;5~2%d<;GTZR5Mk=cH%zekU5`>+SLde8JbD
zQpn}(E9i~c%pW)ny+x0WF@6jEJ*c~gZ-E+6J&8WCPrix#NANTF4Kx)Kk5+_>o#&L@
z#ED(-;n0y_&w1hzd@FIJg)|yjAJ7jB1Vg|Ga1}V4H(0JE?c`iIn)Jl4p*%KXx54K8
zas#r;bHpN06-8_cBSwLm?!+wOVI6VsCd%9bto7F`L?%Mt0j7exL33|n;Q;zb?A#4n
z&T}r{4Wo^8&neCyGl-?J#MHUOR?_Y8jbn(pU<WAYw+u7U^8k1VJPI}z^ZSPt5%M_n
z7Jk221s?~U2<p&V3tyAN`GIsEvW8C17o;B0bTV8Pl0FF<&(W_|`h_i*z~=&M-)3#L
zlww2Tf8}7tKhm60wvbjc?i$8XOMmO=YXNnZft6qtcpl{L<V;@AyE|#~8T>CF|676o
zk+zhv{z=>5_mFNyUwstg$F2=0@jvFhk+ccyIL!L52$OZ_d<AR(Rbb=#FsWz^lUnF4
z2g0Q4LYNfrqq@RD?8(Df8{=3bV^}K(!(<y}8qV3JiF6le;yZgA>Fc1elk+xv`0lI_
zc@zE}&^9JS_DtaH&zfunl_ObyWAV`mtUuD~7<8Ag|ARX49(4|ZkHIIPn)CbTq+fuq
zK=axV&R?wY3ff6UZzg(K`)#Cq;J2O&k@^e#wqYOd24H(Vv7wcBfJW}0n?vL@I=%(p
zgCD`q;5T5u4gakiC~f@cr*n->!r@O0V!w^Te;44tAQD;m`4EXF?E~_-PY)#B$a<?d
z86rcVx3CVYGPy6mjs32Rk~(Zx%Q$Od87t_#KU_vo?nGbyK4otYxeEGPFdEzdZUVP}
ziQo<}72FNZF8-ag{nS%tLO%c=0u9Xhqoj>%nP2F~wVd@FN4g7N*gXUPh{ZqV;vb}Y
zrZfLM@b)7=5zGT=U?Z}M*l6)UZ^?<~?hq{tp`QeG`O#9lHkvt#mTFQTXI%YVO8<^{
z3HfLHcb8n!0<a967#}4oNjo3mJx9xVD=KW^Jpy_is6@|Z^i&;)mMuJ-R`U>6vyV1F
z-T7$A`<nMd%54A}K^3S4+dva&-q%xhf?Z%YXaTLD4eUAJQ?~NEgZe4avW-XM2GZ?w
z`OO03YXUn!<t*BtL;IB5QbzlXuLio7w5~N;cF{%~cpbb6-U07{L*Qf3O#gS%|6MiF
z+*7bGwrwSC1NwZvbwBNQ&^~DcW7|&J*hu^6-a*<B7A>Ek=X3A{_zE;dQ3jlb-rXJD
zwrKel`g^cvLA12Nw<2%p9WA-oU(ccS9n{Hh-<YePDfb(wfUcUtxMuLXi)s2zLZlYH
zZY<-BWt_+>=d%7B_$Yttw*^0~PQ^Ft@lj9*@_3)M-_E%MM1pA02lN92LGuFEAF$Rr
zY0F{O-!ax7>7EYO-$C|2unjbT?Xy^apb6{%L(tK1$}S^F+kBzC&)MZF=xc!$KYH6`
zHy90n186HjFKDsxn-g#ovRgnoci@Sn73|q}kWK{~v+Qy==}fSNaaMtv8H^Fs(Z^a)
zd6@AZBQJ*g-$?ev3G9ueweWSM4^a0Z@F;j3#DRGr4R}CvKJmAh_*=&MDrNrbnSaoh
z#r*?p#UJWVa{lV%{B?o&dy4pbjP-e#_}j|f{~c=!Jtz7`%af%3d*u@7T+a3S4XN-O
z{>~1*04xJ5!PzIGWfkf3U>(SREL&b7-2hG`@Vf}o&h6a&NYB2CKa*}GuI2qi%!Y0P
zyFdebP9kX=cpbb6nvgZJ_r3$Ydp&y?`|Eqqhrk~03~e>+U7)3r-__%PXZd}b-~ZYk
z3z3g0SMU@5mcsih^ylCU@D(@>PD~D!Z%I3sbAPW0m+zr}1V4k{z(!;h=(bN{-q;KE
zeI*<^64cR`+8FjoP)#4Bku~tn(1)}iXqp}&14)N~Mt+a63+x6X;I9I0V>uU$<NaU?
z=Y$E||Ix9TJ+g{)3+I{YGIXz?je7Q9a4mI4gB!q2z>5F%#5-^cd^7ghi7l*so<6rg
z2jVO7cI!d*U$E^k`!9LhDbu)<^T%e+AB?lSHC!g5=MIpU$2;kB`Zq0%U-1rtYzt`>
zDCmYgpTZ}z>DvPOm_=W0;WCpl4K?^F=|iAtGkXB(<De1wu2lSzI1~q;2-@(+J!A1x
zCw>~s`Gd7@t$)%jpc>Q=18U1yBc<G7sgnjgU?F%CECIQo05nr(CuMi7XPttU1+0J2
z2KIohXITH|S^uD6KkJ{h*4W5-p`PEaf@SDf308sflYBcNtsvdl$#){?SHK2PHIn{K
zpnoy+Z!rB+^e>xoIuF!>yxX7R2P^DHUFXYf$lAVnN_LSJOv~m?j($PE4&DUsfXXt)
zPrs`6p&wL3*L0x&9Qwg~lsN=G2A_Zf!uf;r3-A?a-pBd}yTESNbxQ|!J<s~Th@H!r
z|4FQWunjbT?Vu4ffgPap=5RTUo)i4e;9Jt~!H?i)khdmV+Nz(D-=OVx5Q9Nm^V1SZ
zdbXEMqDlLJeqbON0y>|w$q3RDlX;IP)o-`@+x1x=K0A)zTO+#x{K)=r6KUtxr|Apj
zKYB`Tfxik&1O?6b+egGw=&9gtFcUlg9s*~FJtdElHWX*e<D_vQ5zGTkE3&yiWJ?<K
z?kP)T7w|wY1bd*{Kx^z0&LT_rTa|2i68RF43ktx-8QD?+mO*dHUm{gtCG;v#2VDzl
zN|#7A(DA2X%#uL-X*&0`<ndkNdCE2pr|piXW%t3SO&tA5zso&MTz;B!!qc1+o|ej!
zPs`@>PfOLsr)3Li^_i!oW-M(?SR!?#>(KKG*Z?+yDo_iyfhN#=bcyWjq#dvuw47vY
zrx@efC9(%>ow0<!%U&YeKm*u*eu*?*SRzf7*#XMw&o1<|f!D#CU}NJFsc=3m??7)U
zW&FiU<UQy^ppJgkGTs_&Pz^ptrpNyY>F1znzfHa%{R%W5#4iuvm*6z~x1bIG+{6CV
zf~*x(9?NF@*^ED%_m^y`W^L8j@PE#!`mFjrb$$dtgWrJlj&~<<us7=rH1}rx4`BTd
zXZ?eg!|cn)h-0LCSVvo@>)$xD{^zj%!S)%f|1qq8umiO5zEr*-+xiBf_y3Lgc8NrP
zG^i+HT^6%0K>^=&3wy9<uQ2Pf8d(kKL%F=S`Q1@_826hH=?C9P+H@vF20{-3Bfu{1
z0=u1@8?uQ{q-{X|Cip7yt_6D}RLY4lqoFHEH^%U`0eus=1yt40j|$>LEA5ij0v#VG
zl6MD~3ho9o!HM<|d4N>Eqdr7>c2I~sO8Pj61BqZBNCTbePs>!j{z6$J@K0*l8k;O3
zJ@JT*k5<?j<c?$we~G^%F92tcFOg-W4K|ys)UvPno4e~mWfgQ2@x3vOw`6GjCc29^
z&vnqRfIaM6Z3lV(KxYf`aAXU?2CxyVw}r~9y|Kjrkq!GprIP(>Gka9kDfY*U>^W!H
zdk(V)9b<oF|J=$xS|1z1`x<Ydsl4xJ^1iQl*B{LLK526d-v`FB507CVj$+>(z`hIG
zy0Z@h{kQ&Av|9_dfhMpqll^ZBdn)vndhXxrBjniwsnP~tcaHma3HR@{+`o~R9?O(9
z`!eK(j&ylG#w#UrljXVCWLbSQLsrdjN!iK8^77!n$=~VQD?1nScYIIFi?QkQ()y=l
z-G!$l*O4Mi`O*37=zbFv^v;w*kjF>O{1wUk?QVuVv)?1{koO)~d2ES1J0@F}Pgo!;
z$`;6~0ok&;^+`EI-r486%k$Xgg<1T*Zf=&8cF&S^3&P~3)-d^){7=B=pp)N2yv!TW
z3I1P>_4%w<Waa!=as$8ruCR6E?{@k7md)K|V{3QWT%n|rZ#7#$)kP&WJ9|rYeQ&Ab
zyH4%;UQ*8=wr)K+SQ<_ZkZrd9(rCL<wr3BJyuQpcu>O9c6#pSko~E6G`}rH~1d(r{
zzX!VxGcU)Or+v&5Xd4qMdq4|zZUx(FnAb+;nY6xwc`ajJ&oj@^jit<uD}r}3&QV}9
zs0DSP8tA{T_>ne#2EPIOWcDx+38FzCP}w_7HV@{FVO*GO0o5^KQiJ_!r-w=1q%i46
znSo#k7y+&V*MiaD2GBg0_~Rn}aJSzLTDa@C0_&d8$oZo?e`5-^brOHT_DteWHt~ma
zM=G-VP{|YG5Py_#6Z&oeO$S4zu_9C^LhtSfWe*9JJD{h6Jv-4|Pa9)t3*3#Y{Cud)
zBz*vEEDe=MNgoFlouS09P~sP{t1?Oo+IvXtdEPZH@|}aTQ4P@d|IQmD<izcKi+Ci0
z-&3*Pk1=0|`5ugpL@*De0S{ORp8WqvyZ_**t~25D2P}wTEjciO35<~g71+QVn80gz
z18d1K71+QIY+wUBuz`))n2p)M1}abkWn|C@Ic5S0Brt(7ya7c$QcK<X*{z@59jHJB
zYM=rYsDTQUg#s1x8WgC33RKYS=Qf5olX+|Y*gwAYJm>W7bMLwLo^zh({JQ9SSAFpF
zkA*UNIa;QxBgs_Cm=d`g(HK+q1AaHXYmc@-zd&Y@y8j(*0GVB;{=ZciY9AGb_`SkV
zx0YYHkzcr#zd$w!FZJL3IEceIiWuTZqWvEK_YohJ?0S{|jm$j$_j>*}8b|Pd$MAp2
zmNWd{3w+xv{9m$7xN2)*XqHYIoya04y=ok%*InYnp3@IdGccSKZW!g-_`M0T{usY^
zf%~<@{X)y4=fi2oq?|X6v-HaM*f9nC(-ry$?Ay>i-Sy3Mee|4l88u7z|0|qhA3r%E
z&6Kn=&OtT^pO?l(T*g&g!wuZTZ9MJ&Ri3-h{f2vtEL$*lf&F(u{;TC*S}jN9pPr(p
z56Qp#Up1yE+?CFK+=$8w87q;0^waBJb8YfFaE`KrhBsWBxWq=+y16I}6<37e7>Us+
z#yCts-d`Ev{;-KB2~WW^RQ!!=R6q27;`uO3xaYmXP!aUWH+ok;p`TUUY|O=cEW~2;
z{q*^;lw6LLScTPCgBES<Ix^~ePj7rCY@lz#7DRpUUGM=Q{j<u@``X{0XF{3Z%2A1(
z*p0o|kAtXL&pzKA*#X7}QGY<WL6Y8pMrA$UzqdgD-W>gVuj*fz=lsf68`{U}pFk(N
z(2WeT$RWQ@dzkx!X#AniJvr>WN0I%w@ra)XeJJkFFZ2nJ%}ApYS;Spi9rFF}UsG=#
zV;7uaC$NDM7tCREOw}CkE_oU;_0?JOJgS%2*M+={tGI?6xQW~7tI*d|Ul4kJ;T;j~
z{ltK9pL~dZ44mtpd(Q@wLs5j`NI%m4V>CVTNjg#SUO^}p&K>dGGu}~jqh^ryeavGY
zvu^TZpE^;ZeV?v<pEx+goTprUzx}$p2_vO34wWw`SB>uPc>eDRZS@)b++;VqAVX$_
zb7bQ({x6!P)q>WU{NM5XU$i-I)uF**g7hX~3Z@~J(f(rwz0U7(%%Yc~p~tm7VuPR_
zv&FTzzPaRlq;KiZCl@2-dOLS%-`ERFg_k3{k=?PC9kN~fM%K8FTGthKU3I9ZCy?AM
z|Ll;J{<8|Ju?Fj~0h_P|?b`c}EAsEUx?E59b@^u_Wyz?oq_I%`hsi%$#JAp)f5)ZC
zHsPu!eJK31?aF90$99JM9jlf9BgPD*Unl+eDaX*W-{Akd#^hAj7+vd$!C|NPmOJJe
zedBTeKzapxP~2XmmU#B_)Jxbed=S|Z_0m|+ex`R|y14`5j-o;xncen6h|zmqa89x}
z$iuT!LXzH$sJ%&(oyg)iPU1AGw5eyw>PzM@k{1y>H!xf#ucGdRGK@rrvXN3o*waxT
zW9`~Op=O!3ztHb~i+uaY2Wc4OcX8Qoj0xBL{|0U%Kd(FLZ$4mrRk)+aJ2;W=L3X~c
zY?F}<5Y;VF9n-Xc4}uoHN%LfF|9CdQOl|)#HUPawd98JwanxOR?nka+gt9)yy)GOO
z?mEXq^kd-vtPaLd6k#~p+1VXK+-Jvk`Cs=`_Z4~n|DO6^oAA{C_iS3{D4PrT|74qI
zQTgLB=64u_l~&(B+oO&w#<(c_v%#U_n=gc(e^Ew+dmHsfk&%BeB?^DhJ2i@*LZ5+I
z*zv(<wXM#<9w-&=I^o(L4G6R8bCEmc+K#zKvfFW07PjA)xA}fsh{cGFC=S&&TUtu5
z8)R(Sc>HquN;C{H{yo4LHCgYtXwL6;M1J8azpus`y!~-)$(z~(`UY&m7L=hJmDq`z
z>K8+8!WcVQXUx5xOb92*h7m7@-HzFd{WyrjIEonJXfHJWKg#(3SmXcb=GR2NO;$%u
z?wJ19_4WurGg{^t|9{o^{{rLxCC2}!i!;X{DV=7d(HTi`DP-wgDf1U_oPH8HWBb`1
z#UZn&ICNLrH(;Xv_cz=J?Q2wqWAxLGJBw=ilX(s2g)83kUYYaIaK^psaStz;|KJ(a
zUe`~1OaC)j?^z|tq;Lb7{jqn%^SS8%mr?m~<^0<N!d3b;v~P|2pZIv_LN_w<p1mXg
zXguWoo1_29JJ&+CuJCS<Y2h~VhBR*CHtym+`d%6s`aUQQ59vL>9TR%r8yI>&R~&jO
z284cb1HYskU?^IYp&#lKE20m_NJQl-djCe#yOgg^?`Sc79CE_hOZo_uw{CKRxJj6T
zs`tIiWc367H+=?u7D^GnpnR$i>PI}TBl7)!!rNYR-x0lkvmG}d6`_D_!zQO!{**oQ
zW&SvQDVAd;R$(=263Tx@`A<dZyZ<BH|1s`A8I}KN{9}#d)?ouSVGGJoj&{d(OjgFm
zEB|L6m#NGZ&rSKtDQAsa-Tx)-{|fh?Y+c~~m$?7a-GBe<`DAf4HnXkw^TlB&xf^@2
z9|v(5N0I$nL5PvtH)<!)_uh+npDr#giM;*sCO?NB^#OL0y+8IIkjHTneXSv!CePwL
zE+RF^+&eNILBC91MT>FaYh-4WHci`=drg~0-Vk>aT|=~aW3^w828G+gcX1yN5nG}C
zLO;E3k+x*BzWjOK8RXlqdD^d6wO?e7-)omSM*4NAM*>MSV5t8TVK_$OasRvZOHJ$>
z;r0dm)+O5c720{STR5YQ&uZIq>-nuG_^oI7v1H2?ek++0PLpk$#l8QrP%NEssQ9Vp
z_RpS8k7LA75SP#A0R8XZQSS8T<m5XmoFSw8GRZMjiI0UT<TS*p%^@Rap;~-=j(jc9
z$1gk^N#ST*zkZ2)k~Q*P`?~zEbsYI`^B>Lsnd^V^u@H-~6w9#^?R&g`+QLqDT^G97
ztJ{%94vlxbe`vnv{qtV7p78!P$s^j(a^0E|>8!>YtV6nIko*q{8|YoiaVIv>w;;Ds
z`5mDA4pDx`4h&`Ds>~NCCo2*2-s~iIqdLK_U$0zYuW<G=?Djd@_NmJ8>&o#0<@bW}
z8~M9E`luWe&Hp(}M)QA;l2Kh0BlGhQzQO)i*TjXBXhzE}wm#X3ifzw@ESb*m0k)f$
zOz+|sbmAoaG;;JTGU!IltIEIYh`X+6u1x(r*R_@HA)gH$k^NB^&idbZTtvk;p9`1C
ztGI@C<+($-?nE@#uUmP~jBx*lx&LT9rT-Jn>be%R(o;w~u5F|8|2KU9AMwN5^#goW
z`I6>M+{Rtp$3ygE;QX-d1Nra!Dt@DE2uE#iR0me=FlV2ve#d$M@xu{I@k_p7-oekF
z3w81r<uQSJG$48wHM{t~dp`7kYx$J?(**xCNjC7k2TNxpMxz+xFaeV=1??M+zi&1E
zPIlQ(p!)>>`V9Y?%st}&OS6eD-HetU{9k@=il3fF8(R3k)1)&4vrvlkWXE7Oy=%R`
zg|$UtE`2_7`=q_h`1c~?-~8Z(;;MuflS>ifA1^0YqM9C`$giJnZ(rfnNIGA`x#vUu
zc=;S*{5vE6=NyN+iR%BE>VG8Z4fv@4yvG06VFR+?Rwl@&^KZ6${^;J}`S0@lm940s
zy>XfHhvr*RSyTQVDSzaX`8Qjn5sleJbAHR{E%S`&la-%W-~37%Q>EoPGGnDpc1b&$
z>$BhcQ0cg;QAJ@Vxf^@2AF&7K|K2fwm|myM$8nf`6b-M*+h%!OFRx@wT;H#r4{<Vy
zW~9-HERN$OPU9@jqxUyO;Ud`+EimIZN8<o_|M?5-eeqXu4cW*BjN0EXnr9<?8+UOZ
z57CdlPY(<O{}*iohN1|=F%qK@wZFyWI7~qA2j(J@J)dU-lPyc03Dd|Kn1xcL$0+}p
zP460_{Lg$Q%%#uA)A@f7*h<5c|Lw}ZG7~?f{5L5Z7nG4J$_CkRM*Vcg`j8>=r$2n5
z|1ZXJWWO~itR&k#+YWTT;n|?uJ<T9Xa@*%nQ^l>Z&d|60%AZ%^m{Ii~Vc?$5L8
zU}>$!8mz+xMCClne~-m{n}mDa?=55*%2A1`CB{(6-Pnu$$j{##WBim}H$>iMT5Ci<
zil_7c9(ex4l>axhKZx4sI@Hq>NFLHo?He4HM(H7rB$|;%C$ebg$96nm3+&?m?$9Rs
zUxv&gm*W54;{Q6P`4L}*Y!yzO<Nu-!EqC~~?B?w$^%#yv>A&xo9%=kipUqThPc%nS
z{AuJyh!@U`buRL(xGLwV)KBv^VhMBB#a%>o^@r{1Ms|(xRV3HS*CP2O6R&H3r_29L
z$8DE?)FXi;8l-#8e{SF=ZsRWQ<01OdF2DKlf9Lv>=kNJG9siH=iv0M0M*gMU=2}~Z
zu#Nu8`;Vb0LVAGwiyuz!8X^A}Ngs`zXOvwp|AXWo#p0?Km_tV1P@ZCQ_=@yNs1_fW
z*T@E&B0LTGew`Bed{sWNosS%?@u~QZHM4x?%Ep(&+uzadedU=j%kia{jk%bQg;<QG
zSdQM`>LVe0KF3!LFNan1)mVdd*nk}`m4r>?7W6ghvnI<?iJjPuy~zIT<*=VDKcvhd
z>YJ-6R6bw#4sTUHH!7dYlusn}0XCp9^6}Bk=WjtPJ%#iN{eS9)_O<%|^hI`}OaEZE
z_^9tbKM!%DbxF>57)KFD&)2NG`<G`z?-!jzxEZ}KM{)lp^!#4EE!>GLj-&GHuY|Y1
zVXgm5W6l3&XNY?;zwk8uEY72oozaDEWROJ;7yWh_S8)wDa1*!D_dEW_H(m}sFPkGH
z9L+zxPd-FH1}^aIM?4b-lS5I2;fT$&hs#vspY%F)SRC1}mxf~Ds7^~Fn&VK9nlYuJ
zc6@1wPc990WPM3#NRUb41~TS6<@c=dw^zV6jFiSW<mV;5>;5{fV`^#WL>IcxmGbRM
zLl(I+rJ->}X=oZ-8k%R8S}RZ*S|^r<6w;$gL)!>()umyA^d?~nrXejZl`0K0=v~*P
zeV{bVqL(6fOWMew8?(h#^^}IW<b1^F3(3W(z9U~(<jdX>ONEysSu0<tM*`m)G(NP*
z=Ssh=LV99pSWT`$be)}B<!er9SSP#zS=W`zIJbOd$p5<a^WpI8GsB@fpAYXYdMW(F
zvgzUAqUqtlozI0I6;2I*|N7Lh|JdikzGaic4;M@hKj`^v*n4SW*t2Y6`2L;Ggx2a&
zA$8@GVUx5gp5?Fa84;QWjL47WwPZ#;wm%ZeX7w{+i({(Tct^*Sgd-M~{&@Y%;U_1|
z)qS-f#EcvMW6zK<!}B_FkL~=vzMU(^)2#VQoER38my8+C{CKE&{n=1!oH1@Qw7MaM
zVdvB-;qQ+8=TIi?a_sK-&*6K+W`ysKnHF{}m}cC0LipR-|K$BqM|`ayR37=u@b-u)
zp<>jeuw&1paPYO#@DH9t<<GTazj`L@lt#~2OT)YGC_|w%?0sWK_`&A?5_XI4`($a@
zzu<ogf8X(nG28zs>{~G_)E+7cHSd&!x{i_%Pn38jC87REi9Yla<C`TR`<nvexY{)w
z#9<so3~_WUD+%q3N{qdfgwEGXjJ=ew3rj-IJ<5_z+e<>Dv|GeCzf+>^EeWmbOG2A)
znyec8a!5+28EJGPHu>dHjV!(H)t9x6FNfpwlW6F0eACPNF<({}y&O)9JByaB&xG@2
zwC?dDc^T0<$gAWvbe&KKA=9A_O6i~UOrvqc8-BZq+vxkMx;SbJ^(95?E&IM(`ndi7
zeQCJs_udyu!$UIa|Lc#!pJMmldnOEA7`DCbK2?6RAPlC*mMQ;>o(V(gb$pySis-}9
zFrE*len_Yv>Pyu3(z%b3ejANqjKc&>LXEbjHfl$9X-9TwH?(&N^2z$Z#z)%sSNYdy
zK`T=8_}6^zHnflBpQCd;|9mF@d?NpR1pj;t|NN2vjMp|f&os=yEIgTiSW55WPe=0)
zXVd2*$EVJU%cP`*`QoZxGY^hjj2L|>xg6E5J-)(q_n6Bsyb4Kr1K+;>j`3;KG|6Aa
zac3NNMgGWWY$171`_GTt#!lMK1}s0pru#a7>ig`BV2z5j)}j3z`|lF_uZR7I?mg^3
z?N)Xl`|lm?Kbx;<Bl~|V`=4wTPO;6?XhVzkdbM;mU=y|=y;b`!zKq`0B<)(~r&l7H
zyO$Nt9C0pkr?{#CgTroeFJf$&{p3MZi_iBf-edm@A4R@jaXs7b*5iK0p3m?B#P@z#
zI}!bkW~9-HERN$OP9vK89<4#`dzZhtIfS#~&ZFYz>|5{mWqOKC<FWnE|NNr$`?cEN
zYVUqV893rM<o0<Nhq=$5Z#0)N5nXS!=OC_T&dgA|erC8Ty=%CEo4Ad;xQ~a(-trzT
z@GhM5PF(k{=;P@+=DnNFF1z48bU)e(*<IeV6uo&Wo2!Ify2yKr8n$sQ;@jC5s7Exf
z=;{3DZx)9B=saI53<LjHb{dAF2*dHz|6k$xFY!zkcs8DMew_!|c!d3*VE<RUKOOx4
zTK@kb{{KE>5PX52ukibSsSn`ugF`RdXrwepqZs2*@oQ@W$Vr%jX_$e&Z+Zs5Hh*T`
zz%Wa=6tgiOF>%#cNUxhHZ{xkUU-BG&^_+VxZ<FN__4;A*^WXXUaXx<?S<fF%kV)YN
za@%$L4BmP^l#_F%w-`&Y94pa&f&b6%?<Bk4=l?(8|07G!q45g;-<*KvIeh&U&xckr
zwSfOmwn?{TJl|Y8tFZ>_kQSHXvu~hx?T|JfcN2XJa_>lcyK|7;d!7$v;;Qrml#`W+
z(RY%&QLXPFzCyl^Js<W8??>{)^ZYvgzw1qqHT?eCOOBhV{~h%cl?x=5;b{HuLH{|7
zqljT!;bUK=Jfj~<xZN=wj{QUboBnrYB#Rsxm9wUMANIegLt3vW|EWLpza^#7jFt}X
z%ul@=%5++|6VV#r%59I=|8}{y&avLDQQk4vm)q?4VcK|gNY=3zy`#s;lZd^~|5q2C
zrq?O!(O#iv>F3cfRo<4#<05%|Z9upv?lP|88lrx;8)PqE{wA6G%!qKC?9<<Im%NXM
z=*Pe>D-Y=Xw(no?A9`LgA3?YX!!Z*1`=w2b`Uge}7o+bd&xdj3B-D)haQ*Kb<NIv$
zdTmugTbkU+zCUFDqo0fj**CRQ&$2IGunygSreOwVq5WR8$C2?5vTGNge!KgREOKbP
zZvF2q>wn1>K7T7cb<P^T3)cUhvCo7wW@9c|j#)22=KJUFC_nVYSc*Sdzv($-$9o2z
zMXc5ui7(k-#WB%*#g$~$fq`Kaxf;o>-lJFDYkdI;&n&*tdnRs;-_~IRHen0O(D!Ti
zimb#=^nSc3>?V8u+I)X9n#-H-Z*fnvjpjT`@1Xd%KVglQ_Wv+F%3I3$j?%k6yf-m=
z9JwR@=iSX*aj#ILt*K3^cQfi;vOb|6);=e-&kf{$|4E`5X>=ls<2Z?SZF&bfPpNm&
zeMvox>>2g)3H|SEfTqptgL&F|v^q95UcJmFXhW5H`m}V;;yf-Qw$J)I_4Q?XT}B^z
z>SN(5{Tdpy-$`~u;*d5RH^kk<ZQR9uSRWdgePJ6O3io5+B6aDHwL3rIFVXYs6W){m
z6Y_sS{!w>b{vXMIR7Mb$m7$I;!f=elXcS`{CZJt;?oh@%uRrda>)!W~_J5c9Upb4u
z%hZe(wBA$xA1UKxoBy?}FoqzVDVT;Ci2CMJ8_jv3M{7ztQA(eU+)VG@ROi^C9zgYE
z<2T~xBP#a`$;GG|#10}8uCE^Xen0iVa&Z&fkGeD7MXVHFg^HhPkHf!&)%1_vzcs?^
zumM@V@fNZS<*1B!&$@r}hWeiSSlB7N8+)-I2XPoj5knk3KV@5yy`S|?kZE)xi{m(n
z(>ROssL_vKi#X~~j|7rvK%;u2NuAM5wx~Z^=}+b#wW(X$`S=~v+1@2lUlRLc0Q*B-
z5v>8vsS8@Pj~AWiGOpqp(qq-b;w!#9CUhy^(b|Ta;%_6TY-hz~(A{Jou?5~U^u5bR
zTVcJ<Z{3f-8N>D&6Ye_xJ|3bU0~gD`vAMxyuX{O^EW&V%#Ap;_944TTonP^R@&9jm
zM}&L7tk092h8dWJXw0;foQ=7dk7(^gwfKefC+oKt)3Y@%gnzq!yJo8O=ZK>Y^(%}^
z&oL&xz&JH|!*!RRv8R8udEF!X=u2lg^7DUw=l-534jsm`JISsq#>B~taF)z1v;Mxs
z`ujoF-xpecPqvORwr>7Fn%?$ETt{(ODZN!#jWtO7Z_3#CI(nD!^3DgvVFP^=a(>HR
za*lJ(MQ#ySC7m*|9I@@jQ^}pE-XmWb`Pxw&b_?%C@{oKvrk+gfGyXnN{t;i|xV7Rp
zir*^#WP|bh{r+<hhjA1!#F0cZ+TWM|d#>@3{FB|cT$}S|kwfDc`M)mz;#<g8$EC=$
za2wfjg)W^=WN{o%_D`tzYjw(4Y0GE!GsbTHmJ`m-bdIUcMLv1|s`i<S;Qgyk6^E0K
zJBt{7ljnDyUMDYc*OS;RkLw@jElKA4arDo(Y*!8(mm)8dSCOWF$9U5<dY5;q^OpDi
zy7&HpzN>S}0sV&GZlXdO+3&Iew1u~YdxJRyzke~@rT4sC9QuA`4#C%64EKc}q8|hQ
zH~Sh5VuLD2bz~fgLUt+HFrA%>if;}IgB>>%MHr5e7>#0dyzd?CQMMl_<77s-o6M;T
zvSial^MA&(y-V2OW0h<6Z!6ok?Ghhln01HJn1D&B+QGgctH~Iq(PyC)ap~4wSN}Xx
z|4289h70`ldD@yK+H6#*6S6-l&>zDtz-<4kd~Z;A8~OEx>hAhs?0`YqmqK+iIoC1s
zu@HT~3Slw16w9OVcYKdPJ=ybtG9ujjLUC9{M*jcmDEv9~m%Q!x+KsSAcpWw%tzJ)I
z6TK_-*f!llFGJ*$X18lU$Y`zj3~}YC#7^wSUhKy~)C@4M8}W_idGo#NwZ#c-a<Z0v
zfJSY76CbSkf%g5L_WgkN9qD(p@7wudWCuFevd>;;b8KaMkXhjz8I7%MJ7CVGb47Dh
zkCHLOkwi_Q`QcmLzXj%q9nn5lYoFOD`SA&Db>j|x+#Y^iEx(Qp)9Uw>-_vB<W_zT}
z;|HR1nf`fn(=&LoKT*{FHakz{XY76VS?w}CeMlPUq<6hTN;gYCj@%<U^7|j`GA<#m
zY7`rWJd4;R;}7ITRMX=R*cE5kGQwAp?BMUB{*-*2;D53qYZ2cSLfs+z!I6oKb-Jk#
z8m=gN{&NF2aT|AW9}m%wcH_St=;Vucp_`sT_KGs#{cNmd|Dsu%Ez)dtuTw}bWB(%i
zS>tbCVaH=AiZC3bQH;F3Io<nRVm~<H%J1_de`DSbeG;NMw^PVzn1NZy@4q3wl-@O2
zxx{SxT;#?mmx$)zc59pK&S={)-){>M&5chYaZFh|Wo=NkxjW*QVmVeKc1Zu0_*L}P
zSc7%gfKB+Q{jx>4EQ&*gHn;K@uK#oTDTQ|<tDk=_xgQ77qrN*#_SOszN6Cs0e7lb9
z`-STylW0a7oruQtvgC1`#A&3J@yPx@OYbUC{-*nG9Q`74j~tI^jzBjqi|hHleU-@G
zmp>M+kvDJ?w{aKQZ$1|ulKmLC#IwRs^nJqk%?jlQRoC?qDO1tDPepzkj@SXdW6g7+
zQXfK{dLW)r_Z(6Op<$ys2sM1`TEzL>(Kva%areX`{y!Q}>74g5QhK9NjB%KNNoc=i
z{{OuX=l}0ACcn?v{2}v}$((TGRQ~@=<NpJU|D*K*-yUiG0&N$Jzi)Pa#CJWfo#vMx
z;+OO9lk|p;=ff1|n1&gcg;LDMT+BziK7ft``UsBbe?Yf61sVMe*?IaO(8$Mc;_El-
zqi7*pg;P!XAJB$q{ry7eEXGnSN7{9!MihmW^sZs@Hbfr<z2e7wzK!xGoN-JyS+1<*
z*WW8k(f4;^bNsH|UhVj5_b<M{eO=-{8k>yf4<&^g+>?4T;n+2fTZawUge@q;NcX&)
ztZ|N7#79X7_4LGRo)4n_iKqPw-l>**{0q-N<+(nYU-IwvPu)@Okwc|(?8I*D#ePKN
z`v=L$pO5zRi{=L$7Cwp?;z*(yEq4k-n(Rat$C2J%7*aij;Uv9l5xZgDz;K#=7J0it
zI2r@%CaZkR<h=N(|KKut6|q+fL-p9gaE)HKhh6XvJA-}`4chFaaDr_TtpR+p4^R05
zYnuGt^Y!7O_pjI*&;Ab~T7P#}T;I!Vkxz~c_vw{i_RY61Jrf?%`;j02m)0Nk{|P5|
zutCQ7rpi*~4MS0c;TVb0sAwM)+TH7pt;#srHAI<Km*@9qqvr;AKI(xcbwo4S;<(l~
zJf9Vw&l3LMf&rmeI^!?_lQ0F-(DxFb;-^I+8vF10tLN=6X^zmlMPY{D%A<C3t8$N7
z!llUmO#3^>dR=;oOfS%vL9hIR{C>ZH?dJN_5#5hmADQiOeHZL=y6@R=c+j)q(3QgQ
zZpm}uAErJR4q6Lz;MT{(k4ios{{F(Su)k?U*f;1C;fL2pgde0vg}qZh8TQQlWcYsJ
z3*mdOy$~+Ck1hP?h0(dSjdRJxNNbmulFQMxPdkdt5`6^7A&ZrMTZKOvU)cV5{Jo;a
z_cHyy236w+g>_^#8Jlc=8hsPCpbX`x#7^wSUhKy~9L7;p)eZ_r&lQ9tmkPp<UpH@U
z?3nPk)9rWWw>T>IeKNd#<rBuYMui>NSvVs6UDGE*(r?Y!Z7l73zJc=HGavVD=3!y)
zYlYzl8w>42sSWn~{u4v2MJxzE-1VICts&O1TmOg3pZ{xk+cD9%ShsDq2kY2^z=sY;
zPIwmnS2f`G;pip){D?>4C#gr_A8pzh8}K`2|97Eg-vjpXJ$<zILZa!8zVhFCH-D>-
z_D*Q(xfL1*e4uXqO=y1ffp_bDeGKo1<I+5d(>ROsxQNTRifg!mnq38<c3(k=qprFj
z)MpAp!o5j8C<qOA3fSMj4>uim8+UOZ57Cc-|GWBk^C0U#23h|xD0GqC#vC(bRycRd
z-sA@gLKB+j>8nDkx&A4n>20VQQxLNM^!qSaT0>EU*s_99J-r|dr`N4;Z5T-(jfO+k
z*dT#=6pL%opFWP9gtYo*3ONm_j>mOR*OI9J`%!rNr@s$bK1Gf%(5;MQ$eJtiKTq3k
zysU05n?t{2VjmlXjnUA-zTaj1M|mk%US>#h7D_Q2bJ6bD4##$qT?sZ#ifxl&<B&P$
zZDhkVUC=*)7QeTOe=<LyO?bXE7NY7LJBO^k_e@wyE=O$4GhrpU3U&H2;)7h<de=C_
zwUNoq{H@P?EUfl>@2~B5N%nlkUQ|)oT9FOpCTv09U)!IDEJtM&{<*ayWX*y{`d1!>
z__;@+4)u2)85esLl4!t$=s&}}x1+qb=l(V97T$}BZ_6XvH}<Rl`qh8^o`1jR-|zYN
z^Z)xp?p}Xr-1CTi{wOq~1+7c^y`%l=zkc;!zct~H!hY#h)@vX6kO%4M^^Zbo^P_N>
z-nHaW=)_Ta47mf+-s2o|9(jhYTU^x`*Go1>$4q||(qt#9OZ+}ezK|6@j^u^@(BK;D
z2goOC&irerm9IGJ#)?M*Ni^W3|D48IoX166##LNH`!4x+J)QgHKOz4q*Vf@0Yvmt}
zJ^jjPzy4==eb8@yiu}8Vv}<T{T+6b54L78dR-SH>w<G<CWBm6ny=%O*ai9JWxna_t
z>>TKx+Ml17RyF2deT?DpGm1G+zvBk}wfx`e5AoOfLmeu<uAVVooP0yRmdNKu`J@kZ
zT<=T$p@{5x^U>q=f6qP>h6|6xXcVL3)5iZlT@c36`@Ym4CXka5?cFnl%zjZh{=%Rz
zjh<E>Q~Jqf(7UwJof%_P^it%Od2f`p4B4%1uN|Wdj^O`~e>_eV&F!gIZW^>-No6_v
zbz{H&Hy87<5R0)C%hB<`J9tOgTc8Y*nOCJN?P%|jEWK%nGN_!lD2L4hT-POK@SM3p
z!qNDA)E~D}I=#QOeuZ3(kNOX5gx6sMwxA5<==&_&iL82`FF`(8|F@f-_fHPkW1JqX
zS&XX>_R|j{-~T3G`F!rNN1?c*h#`(7nvq5yJF?=(1;z;fHFOI1R(OWwah$|yv}iNV
zk~{usR5(vwL^QUXdce1!cU7}Z9CMX^4Y@<oKt_Cbg8gsIwYHF-I_&YdYyH%y|C@h3
zQTsdIUUvRde#-j4V;}Po3jOWEH<9oE`@MUAi+_Dj+pgX3BD;k%iypV_(YS2m3jX<6
zKK4xhxpZ4c@vV`*!au)Y{lEeHo=ER5?&Be1P1X-|SU*6I`sAbie+DjNhoQm$qP+(a
z;-kF?hKg%(T}9+@r1d$BBu69ahsdx0d)*v=;c>{X3lQ$!YMf%D@&C#4KixS><lpt>
z=jG3F4n*t!Ciu@JOu;nFz$}zvHrn6+ck;hSAHhEJA9sDY{?D~GyVe%6^^yFOY2h|9
zn!7kxI`gp*ixE2^|MECE%D;OTt^ZppemNS1lQSPbqk8g>_8*<&U3N@<eGh#Vz48m{
z)vvQnCbIv=d*&sc{aEh+qWwo#J7x{?>;J-Yp;!CS_nv*}KCT@R-}imv((LCA^qybp
zQzEyZ4CScAPE;Km7<QA@<X&<=V)TRLVbmQ_z7yX2YUNwI+@S17duaTJ{eS$|qfuIj
zBZ+3D(TV@R{eSuOKNnmZp0594yXVJ$Cc970aU7>{7UyvhmvI&C?qdi3#J*;4=dQE=
z+4)T?*#BgU^jfDYn={$}^fu39o9D5;TK`Idy>A}CHEG_!P4sEov!Av9p!I*Zg?qp6
zo7`m7|93wMKj)oNe?FwY{YCo={FS*1^z<p`xWac?uKX+eo#%`>(1#+Ik*6l_K!<nX
z*nn`-@kQc?V<bkS7~?PjHA9|fgPI>cihquJdIHH>{y7>i@vm?3uOING@A0qq@vr&r
zX|$!D5AFQ=j@P_@=t4I#$RdZT6@`CT|2oDRb?1z|WBt26gemm81AN~-)(FsNpy3hU
z_YNQU0bh7~QJ5tzI&bA~>{HyWPIoP(;$|atUBAF){Q?WjT@ap+>~#GC<Mk8F(@!wx
zdHyH=U;JV$#d5?ZI{#SyKfTVi#54SV`f4=ve0YB6uGhb@M%+4Vz$R=#8Ol+Ko!E`N
z*pK{p_yy1ViuEqSHLvRbTcmFWb+7B2*{Y9bk9HajIP93C$hY6x@Jj8~+ZE;>h)bf~
zF&*ec7rL9=Ut|&O_0o7p|K9`sf5-Iyp;cT8X|y3)lhQ1m?6<U++VC_zJ&cWdPrFU;
zD$%Y^_Z$5<avSN$jL^>Fq`0a>{Cn~&VoS_}BQK)5Nx53juHMYI7ru(*RJJSX7qMkg
zGt>AU{=5C3{&NF2aT|AW9}m%w|8D=M_dbXJsQ+`||Dj#MP!yr>i@uFPR{lu8{jc>S
zSep~=J<;<!>&?VJ-OqS(2*twDn#ytH1WdvdM13z&9XyTRrJP4|2WHS`AvZ?(!;|$K
zrQ&+NY^@8~`?B|xoR5WAjHOtPUj33Q$yHd5ihr?J!LPlGwfgc9_30({d5;cwkFdru
z>kv~HY#=uwS`$+K$e261jof}uTPc1E%219<?2Kf)^7MVrr%?a@DE<Fy_5W)(>$RH+
zGKmKMd*fvN|0neSvk(4ceVuc(`>lih(0R%H06f{ZKYK_Yz&`)+`)=pki~Tr=Xl-8V
zg8h2vU1J~5EjUV#AvaN-C@w>GPj_A7s<in@vN<}2o+dj{y~ppn9$$A>_&D;{Eid)!
z<%6slq5N%k-%y8oB;HZ~H@UAk=|87&7UyvhmvI%>(9Z7e(C^wwcIl7lCNsiWGN+6;
z9#Q`%)c@7${|@!PXZ45uw?nugjdJx~H0N_0ZVKPVT}1te74-wdefrzaD*NBH7dSl@
zz5C?AuX_LHsoTk+D8g{0#icNk-ZjztH{QETFGe(HCF_06AX=-Q{pV-GIPrbIW9yKU
zFeM7J0j7~rTQh^~HUDQ;6jm-uqp<P*+2mZzN7VjT82^jf|AoSf5w-tI$>sR(wf{Bh
zf?9P<e7bsKo_b-KdI3p#1N%MyeZ(Q^OaA9d>8!#UM0-nCzVHJ3)BV5W{@-)|ce(%j
z-2d(FKXMuOe@Ns5u>Bp^((~c||7805<Ng2FNoxZ(p+$YQg-l)1f5sjtqnD!+o$?Z`
zW$xB~XJ)!@?rRRQUDn_DeK)FiuzkpAOnyIk5Xtr4Bh;_;UcDBTPj&_l`|T)V=&4}i
zk-eX0+eKmbyg3T<F|xlI6Vmif^yx#c_`SIWzkMNOqwtqL7LJoAaT;gw<UKl1kJdz2
ze!+Z%pS=(+3tz=G+`vuTMvXDz+CkReTSs3<)(^1$eu(w=Gp)bJMaSI5eLO@z2Ch(N
zVkp{AS$}`Q`ulU%-`jh?`+@!UZ`ptUfc^K;INtjE$=2UXtHpZzR(i_0($3XpK0(Wl
z7ekSBhGQf~BmItJP)v`~?Cg0ljH6FL?v%8}WyEzm&m?hG$6gFm$Z3e3c`?i&XQ7%N
zua>XGi=k9_Hj?y)UGiBgpMI+`eqW3D1LOawrzems@!UM?kIw(C_}aiQR~qy2w*HmM
zckO?7j{SE@8S7zR<4@-QE>QlLc%Cbie=@)R`wsgL`T4)b;1^17F_vOE(r=g}@R~7S
zde<Ilzr&WNuSS0UuW)9&bCGMJc+aHDT!3}->WSt`ked)QS6~ZShC0_6Kjs}g<y|Bj
zwyNXEa=%q#Cw5~m_TwN9<0xW?BZ+2oA{)VG==rrVYGeMrzj-l4>+Ykz&FzTBMUFe>
z$@;&O^wT(t^XU7OcZ<A?tGE`$N#_Q66Sr{}(ft4W<U{mh;7a8qnnOeO8i#CAPYk6O
zVK_!2EiT#vXf!=P{y5$~!}M{;v01WsGX7XIoe#2|oq~UF{_nO&f0+M0$^R?$rN3P;
zC`_SOe5)X|&-47zxy<uNH$8*wX7vN|^M7mESQ*!vGWUSrlS0~W(fr?O(wTu-D8+2d
z#eDSrqBy_)q35gK5#e5Q&li(Ru^cNA&Hr6RuErXy!;|^H8|ZbD<xStpCi)gMjNo@6
zF_s@TQQr&Sx0df4U&8-=o&UR)Z%O9Y$-Smc@Sif2qY`=hmE9Qaf4fsS@Bi|FJ0E=L
z|MPn@hd$!}_VEAFjFw0IUp{Y&?;Gut-*!pd6?506vlshu5Yca`dxOJade?Yqqmmu=
zwm!m~eu?Z@=ODXX`%%Y4eF!l!j@S{`PBx=jKf;rJ1=7NuNWRXmUn`&M<&(_&|0Rxl
zNB&VSt^D_o=ncsF&vBf@X`ID*T*PIx@8bW<b2PTmRqdKGu5G>NkNo<3_oeBK{6~33
z>nV9d+Hq|uaRZF~O6MAG;3m?e95am1N6+s+;QH>;?;}^}nvg*^9*V1)BfY46(4)N<
z2L7MkP5r&qOXO>_d@b-F_0?eU$$9dHXzzstwi##Ne#+dnE5_Q-*n874(H=fAdJ(<S
z1AqI6+UFCIf2@60b`oQ>&t!ve&%1vfdOxX;;1l$h^c{TCo}B(S5~I=k3wwW%J=PbD
zBPXEmvyS`q_%Mkc?Zur~^ti9D;dSrh0OfIrcXpZcEn!o9P#C5-ZW?AF`@6z0i!8-#
z<i8`pUdX<t&kyal^vClnp6oXkeG4^pfnO+pZDb>VIo}^I+#($9?=@FC(RVoJlNDdF
zr#`tDOL64=f#Js^^wXGca8!Q#^r0>nzY?pk8f&oQr3oS0i(nl+b-?-P8|alkwcp|o
zeb;bD)JMsFkf+QF{hDM>c#C7oP#ztNo!E`<UHNYz+W#l}-t(d7yMx1CalP9-FLus$
zwo5toi#xFP#qgsOFNS|O^%vpb=F#EM#?j&36Mqr@>jU<)_h`=>Bg6Nb{#*FL+CL9_
z&-{7Vcka)^58oJWkILa;|Cv7vf4k(@q5RcfhfUHth_`o*36*<38Fm=otJpUx{N1n@
ze53D$aM*80@x2`*!tMjczN{tt?!GbM2a7%t_Rjf4i1{s!zkkghm>ZuB`+TJDhYv;=
z`!b%GC=5xzH6x8qWby7D-*H@97>?7c1{8#&JG`?83c^qJ6oh{q_4yDpj#NA3^Py(q
z=ao(Cum)InHQ;lhe$30EVdCeE(R@zb@{;kdmz1N=82@@n|E@K4=ROloO7k?%;yf<m
zGPXswc4SXp(kCihGfe-6evP;}19eOEZ|EyXtk=J>QUAu;=fgF}+{EMWkJxjHei!%g
z5behCI~JKEfG%{uZf?L@<K|>;jQI~^^p9A7(2N$ep3pyXM*j%ew%MGB3(tps=?wfD
zn+ijbzT+6b7ty;0>BG>+GMqjVIejnL0s1aR=)=HhaaDVs55?p-#8S_P3FIVHZ<nt_
z^7X2D6vESxoFiZISieX<QM29tkBFlV^++Ju<XaU-?1||=vrvlJn2Yib_mFM94eitI
zzgeQMKz-4*&-=a0`<?QBYwvPo<67m-d)_=p`==dhb!=*k_HTqac!kEOrMD0*0}8`p
zGDR*YE7Un@>l3q|EeI=xyR^^IJlxC!=U_kQ=Mmh|_W5l!)}U%mVOU32ld%PbVFSJL
zJ?-^R3PSw4`s9Q*?~3}ywdKF><9cgcQ|)y1jqCcu_ecD$jgI;#H%Vg)%21Af;)hp~
zJFy$>uB9X4+N$NBjo-ad`_Hz@k~!f<*VyEmn~%vqTASqG`P1%ATP=S<8od=oVL#dP
zFXmm4hjA1!RD8<ZH!_K4q|x_*_l?Y=<(@qT$^7`kcMJFh1tA&(P08<R@m*V8+eUk{
z(a$3{)3cG^%#z1-QPuQ8VX63&xQNS$?XjkG{Gf1^UYBux--5YDzk!Aa#$WFlf8A&N
z72DJ^+fmNXZrNoobH_w;=b~>2-KO8geRMA1|IgvGui(4C%7;e|5B=7U8&Mf}O&P#Y
z6k#~>_K7w!{)To8_1d$%eG;|f`Vbnge>ne;jJ^ZV`o6vcq?eh0sDGlJpWlH_bRn91
zkZ}#s+=FO*qUxAF5a$_<VvIvf|3mdT`~T4EhB&wLPohsjgElvb1nRZXAKm|Kn%}cO
z;<IB0eHJRMiGTZN=I_&IV=m@nAr_-XdtJNMIppWx>wj44nB`cBRalKRSceU0U&8)e
z!Tuz>PVs*)@qdw}=Z^7zoA@6c{Ev6|A81{#Zvg3y?9a90jL~nB&K8uR9BJ)u3YGLH
z@6S&9ZbW4|i>M9jMwU+$&8gVyxBZAI>j%lhsHVrKDp%gQqrw&Mu^W_!XkY#MQOY5*
z-%+po?aB8%qcy1s_Ih$M`-053caPN9@>0cCNk-|CF*MUFzvljaMO{JfL>9gJ>yDE>
zKhb|f_I*`ZAkX4FF5)t-;u?CtYg~)$dAT6mByZy`?xW&U`q#;R3|!^?MBg_*7KV~V
zXqn7L`C>^JP9Ke8q{XE$j^1@j`BUyE&?g}$oE6TH-RJZxM*IK#)Z8BH|DyeWp3Y}F
zXRm(8Ps0pkfAVseMV6xP1!D!qWoFa!`7bC5bLlPX*)U{kl;=CdS`hkTEJf!4?+3b(
zK^D>cq?#S2p*B^jjF+k(OVy91AwedOm4=2#rD3`MuEZ*=#u}`{25dt6qF1zkuW0{X
z(f+-n{(D9J_lo-O74_dMq46E-|IvJ))cy$8|I<@QZ+<1TIVR@5R?RF8TckA-u~$n&
z8Cj0%MXqa}>q4dQP9*6Ks7C_%@vlYR547xgCG2)gs^gWAp7ToBORxL_zoCFVDBQKe
z^(}CHC9ZFx>l^z@s97NYOPmLFuS<8Ubl;Kx4(GvsX&l6197POqB+-oa*OWhWq6^)~
zAd4It&&dA;`A4+ZLF*Cuua$rIq77B<cUn4~$l^F+LrU$xSQ<{!>xQ{D;nVc9Xjtpo
zTxVjvYaLJ;&WpQ<%eabbxPhCvji>$9gF?6~oMq=$;32&qe>DE3%+*g<{*=Ka*&sad
z|MJdbD2gx~BQYAqXn&u*jm~%2x9Dd3W{{=l(6~hZ+X`isY<b_<`vdkk()6|-aVwM)
z=~RA3JN-}g<EQ8M@5($ACegd3)w#zUWBN2i-ww+ngYND2{STiARin%`@Y^iJq+LqR
zMm0TtLBHY^`~L{fM>J=q0re;3lkELUY3TXYEB4qeHO^`82**U_YcaVL%Mq2&m1H#j
z+4r^5u!_D0eP4X#@&1Ery?<Nv%?0zkek&jEn*a21bJZNP0h_P|Whh4_c49Z$-zf<l
z=saK#gFW^x=&*+Yvghn!fW|@IpF-~s*)qoaL#Bk&WSiq^4takQ-jQnWZ^k=8CWMn@
z!&dtT)EDIU5Z>!t`*9G5aTGDsXwPdApQ!yoG^QNwEt?<z7hdg{IFe{a8lA}EIP(7Q
z9{w+$`oG%wC++*w{eN%qbszD6$<!70!G#a~U;g?@>72${oJYFDF-_*o)4SLbow!K9
z?|sdU;uE>9y#MbyqjFz$UA_4#n-nqUxa!zzsJ^FNy(C|#{E~X$o5t0&wGAuebE|yP
zYow9ifAEcu*cY4SU$_A`q;nIuaToXT5d9eVb@_i?`CqI2lU-Yte=;MSCG+D?qxk=5
z-Y)-WMG9%OA^LXUVCf7+5r!lEz%h61`9trzBL5goFGh~9pG9T~{~zPTMc+`GKu$um
zck&c+8lrC~MRO<WFhh72k^|%mk?oQ|{`)uQw9}}Y{HNu=)PH6p+WTNWxe$x76wC2P
z<$s4Vc}UsZu8bm=`j5;1wzc{ns+FIF`g@7>v(ij8sUNV4USS+!HQ6cMu6@$qCH<+=
zM-EZhU*q>3-+CjgBR3%S$T$Qx(d#a$1I`&Mh}ORwA8y#HZbP)jxc->+jE*ToIV#cD
zsBhwD=Je2aqxVgH8+?(TcI$BL$F^78ejLPM97PNj))8d?x-jJX$EM3qi7{I7Ni-w>
z{lBm32cTzh94B!aXK@}EaT!-}4SoFmiuyw9f4qmnz3hXV<Zax=eMED<ACmnT_`13p
z(cZ$*9&<zK`SHiG)&$UpBX^H&h|GBJ>SXQzWb@0X8{;nd$o&5u#ud@fVV=HpNBZAr
z6l2>c<qXl-!Z_jfWybfBA7|gpCw+r&itH%k>}aeu|G(D!e`&Qeng73@Z;NPu{x(z_
zThE&NJxN+I{_+HJ3aS#u{K@EB1kpDM8WMbR%ym68#Mg<7NB(n%{p*GE`(Hn&?x#n4
z3#OJmA7<0LHo7*<rO!u>|C>d|G2O58f7i=D;;2LY9&-zjL<1K3&tfdaa;(HEti~F&
zyH6dv_+w<(A^tx-6Xhwz|KA+>fAWuJw49LtW7ZZRjW$$0C<^POvjLm11+ih~AB-vr
zW%OuGMjYkzN;C{|ZAdsK+QTsazK!ra#^{gr`)=$-g)yWoD!<^nFuyj}K-^&*MGSEy
zQS%P}caQSFPx<d)mtSC)pWy$Z;U2%$F==!n8}X)nHOZ4WjdtazLpkb1{`(tb=9Kbz
zNBKnKF8=Qv?f(L8J=wZK`%k8Y+sJ7C%30~0$3<L5dJg-UU2v7&wO!iteB+jW13BUB
z0q5BB*w(lyu4<VzhvZ$v1{g0TAEH{_5HFOk6VHTx;eo5=`HDI~xSmX0c*gg2^zYv?
z4x+sDwcEeIF+KdXiXR!%_@#Y~g|i<NgyCf0vmuPk|E}MK?ESiR9oqc2zhNG-a7%_Q
zL#7h^`)W2neG;ai^HuEwx|Pk$0?&rbJ<$K}w;7m)Qq0C&%tzmQuH%~pq34(0L*d>R
z>~l#j#d54f)l_{TWW}e=118rXHqrXuu>-?8`jd6u8|Yh*U-xs$IM6ZUKxEDP+J8jf
zc&$S{J%MPBS+rO0VE-vU^LVa6G-qIwa2YB;`FMUn`#JsJm$Xqm+JAKK(Ka2@M(s2I
z?H%*q=4t=%N8jJQqy0l08hiNfXdc3UN9!p5JJR$v<o6dS<iCsSLbtdKp6u^i?ppSH
zww2^g?8e?G4hK<l$NUBUe4L+Ocg48>sgJB*JmQ=ajWamzC}M~sy+ZptmJLAfnj>#$
zj^dHa$Qv?<`uaP?Rjn0A9!G2xdxkuX>enCl9n=l5PoeO6Bp<l1s7C@d{#)B*Y;K<U
z2i}SL6`l>5^o}%;7yaj>^8+pmU&S?KzcL`SZ#VwE%lP*W<KGAP|L>^VGwS#h|DR3T
zSi=ABG5-fGXhrIM<KO(tHgpYAzAo|gtpj}fv&w9t@+I!c{GRBWIX9i_HY)snmrP$*
zKGhBP>G^##U$vfw-j5``VV?4}NcqCR|J(gh76y|;5j$dBfgFx#eo$?#Js6Hy`{;V(
zt~+tX8b~zUVXHlokI%jkMmlyhiZKooFbPx8u~i==qP@2}5$(O*jT}At)>+d&_rN`A
zS>PVfQ!Cs9v<chqD@>Ed49r3)W@9eqV<A53KP?tsj+J=(ckbDnW7q`+A^Y<ee7|RK
zSS5Zn)?ghrU=y~W4CScAPDJ}4?k4wQKMqE5#v%@rM-hv{(u|YQ`oJU^?f=&th1Kin
zDE#%oVcW0|+vADy_{cpF|73nimVO)+-yGxpQ2!ioZ{Bf_7P&Xbp=xSzSn8OQIE}N2
zEwlgsqT+C#9(|KOKF>OFbHbxJ7Y&DsL$roIf%-Yc;i6+&hP)UqlUI?ZUn6fIMeiJB
zUO?fC;im9yWQB9$yGIzq7vFf@{D52L2aql11GJK<_stI&WqttKCz@Yi-au!Gbja>u
z{x`-P0BPi;byu4Aal^a!knG36Z?J<f6gB20)FM9*AoFj&FL3C^P_>{q6gh4<Mq)H#
z;;K<huao!qtHog)eF7T9CxsK@>)oS|w!f47J_XZI(dr%}_nQG>mT;xH8gE<ERncuP
zaN*g=wkrR-%s<+v{3EKX>$OJ-B<T$&tPgkGd@RIbEX8uH#46<XpFgJjqYK^mqy7Iq
z?<O=))YpgB61L%Z@6Y7N^H00h>Yu`DX|2K2{rRuxpF*^5ex3LY*n|r2VtTXkzuwrD
za9mpx?J-qGFGv2np*z?S2iOs)6j$Z=o#bxBq_>yckLv065L^2{hQHl4DOA>IN4;N_
z2d0F#&-`Vm7&ayBpzj<zJpA3n;o+d;4`cV9k>Pt|MuzVW8y<EI8yWVlA02*heRMeL
zx4s{43j6OAhrhS4!Vl+-4*NzGhl8%|AJk!SzbDa*o=*)A@7~eAe`;j-uVY^fhs_^3
zbl@+-yOaOIcj1hqW&S)InEB`7M@@eo{{Edm3;S#TEbPM%H%<vZnE991zx`#{)AX0&
z`_unZ_+DZ{=>3})!+GNZJzupazcQ9~j%|0?70A|TyM(hij_CRKefGt0l71RJKPe8q
z1{HeW^nHbHbNR)c$3;}U%LgX2-{jMh*Kh-UAJ_j^P#kX3Z{sfR<01Mn@D1-jhGJWZ
z`e6wB@RT|7!c`B9W3tPROnoN&_>A&5Z+&PQ@MdTp@@8lmyEU{9_?wU#@i$?HbYd9p
zKiSW)bDrZrf7kaFg-4?p<1hh7)Th>lhN`V0{3QB@-3V)@?>!g(alo^oW~}`tPduv+
z{oA2#$hY-beLEz^Yz)cG-_j5Btx$8F{c(r=fx5};kLm1>67~n#AUsL>Q!ouPFbk!a
zjk#!l#QxZ&9p1tINNAHgw9SaVQJkx0Uo7$cw-vsNKt7#+f6hCM{Qkq63&MQqEW~0g
zMLJVpjHVzgr*|DF(6><#R?=4?H-haklMOPJEwa5JtQJ>QQV`aV>kxZT5H^sTP)(0t
zC}4LKge}5lNNzQ6auGY^gmrcs%|CqO@ptp$o7pFq*ai341@e^NfAG5c{gmg2$logW
z|H^L}PgalYr0+)iR{0+!|Aq2Tc8`&NGMXchBO4E~zcy+&9M?j&E@IoPkpCs}zd+ob
z0(*2Sk2r|KNN=@94@c=;P1g1;v)+;(NA7^Wt$o(`9kJH0)^ie9l`06$WE!!VjwiFI
zrpL#4uJ;Co<H9GAJm<My@tkjY&e5~)Fb2mCs5_;Mk_mM}J)Z3UaoT^*;yf<mGOpqp
zZXiGZpN}8)J?8iSIP|y=BJU@NYg%Fq5>ekmGorr7R_}k?`+N|xrNrLk(z%Vhc!;R)
zA=SZWphtZU(b~Ss@901Habd`hpH0<&zr%Sja7|Rd*;9%fir77SZIZ(g^+7~^4|NzR
zJR14$3T&0fW9%brW4CQ*zm=1HztGpfF6jN2g3$A9wC_hj=rb<R_o)K=Ry`Mr{eK*K
zDh7lJWbdzxOOaDh@zcW4a*97ePia4*H2^c{vrvjoZAn)```@AcN0y$e=AZKao96IO
zU*(^om7ZF_KV8p1Mf-OCEjrPKZe)-}4)@)o+0vhj`KT0LNG`@w)YxaX7V)+G>(}|L
z{P_f#M8gvP?+(80WcOf1$j^UW?msbc)%(o9r`PrHf6c*PMPH4E_x)E~;-0jyMqJAd
z^Z&o0etN+(rLPmW0jWKF|6S()o9n+xcnh*q^-JK%{Qnw$bbkMXd9H7cG*?LbRp&s%
zMn1Uzl%o<mu^W4_9|zHX#Qi~Mq5C_?{T<`}4sd^wU;ka`{+@DwFS)-L+}|em7wKB}
z*Z-=H6@|mnIf@wKkrY?$*d)Eq|KrjB>1jmQ6n&d9F~GI@ty5eU$8i#Qdt|!zNxgqs
z_$<z&!aI{i<>%NTzYF2AxT~l+#s)v9?4gcbpa1^VbbVyXMFV-wF*k4%w{aKu@euuJ
z_ilG6e^J@%Lie7>zIp!poAgG0e$x{E{|aS|%<s>2=W*F<BfFaXw^lj&f8?Wr|1pC9
zAsp=y+D*2w9R~YtD2gx~>3zzlvOJO=zomTLGuMw^jHGbGBK`-NxNdH<xGFZs1acB$
z=Zqzg(@;IkxB*!{_QUtjJJ?siM*E?;&f-ha`_H~BL`L)f=aUPu7)!Anm0x4G*uQ@z
zJ=zOo6`9(q{B5+hioOQx(8<>BLiYpj$2rdeIc)G-m2$X=+=5ufcOb}eRHxX52iS$E
z6yAyCD0U(0r?L~t-QuEo1AEEdPZZg~jUDplb76lJ{_@apkUWg6I;8J&h2bbYhKiSj
z>&;J@seM4cf4WxtpzVD6eflHz&v{?_g3eRgmlN8TE83T1+85+Xe0NV;AN5b;!bvpa
zqW7aldr^z{JpIt9U#5M-lX+-03GH8sf1c6)k@dofE9UdFR~nA#2arxDD)kXY-xSNz
zkK-gxqkXOM{XuN30s05D&E4ANsBO+pG`^3<?fMtctetInUH^jiH}#ry5zP;1na($t
z&UswKW#s+;@oX;o6aW7j{RSeNBI_PynjZWAH^o)0H`jr@i^%`KPd-HTYk%<n`-KO7
z)4h4b|L^(G|G&rnAJ6`u?m4r|>&W`ap6i}Jtp6G8KSNQ3;TVbXj6QR0Lpz_pBf<W!
zX8*5c|8Hdfv%|7vPWazl|C8bSrue|M_T-daF~*@{n?CFpd|!iJ`MWob#~ItMS6+mt
zVFqTQ6tgiG(fpJ7<U;KA?ky(M%3n19WGTICtMa|k_pj+Ik(=omPFKE{DBq}>uK!B>
zYOKLJ#P;aFA8*Yey-ptEDSi3$EogY4Zn~#z?^DK+{i$z=iZ4ecK3f0n6yA-!(QiMO
zpJ07C{V<LqhB%UFMjD;S;y9x3091VNW{CXllfu29dnTMF&*D5TqU8<iY{{#*h8sw~
z#>WzWlb&CHG1j~>`dvI-|Kqtw{qOh1Jw!jEdSKvM?+qDCs3X4eW~kiex?My3miObj
z_oK)A@jm}h-O+%dj_=_oR1~}wdOyv6_~qbG6vch6APgt_erRrvcOm*lS~OQ>q`1*2
zj(#()(7vlIbnGi*<CKN2>N0&=W$LFg^;4PpsVp?UUKX0xmWAfcWuaw#S!i8Wrf<D0
zq#f6mDhoAB%0lgHW#({{g*wzDfg~C*&UyCB+k^=EU`)akOhb({YY|5s>i4|m{eMgU
z`&;_o-wHDvGYh4djk%bQg;<RC9dEJ!-(vs2rT_gc{qJw-|9VUR*IOaC;w|t0Tl(MM
z(*ORJ{`a@^zrUsb{Vo0PZ|Q#*_u#FtR65JC6049NCXb`a!fJZgboT~p=<ARh<lZ3T
znC|hu$3BAJ<9yLKn4)_eb3ZmXE^Cf@^@=~_Ym>MwNWSX6Ab+16TSKp1<T!fWR`Kq4
z!u?J@@O<y6_m!!)U(zP{Z#gQl6T7h&f3*J$KjBa2|Lo)cAL9R$EerVnOT3#av`=KT
z|4hr3tzo}(4&pG5B7JLXNFhe=TBd(s5&M9iM1KFvJ?!5D`UhTRQ;VxQur;K~PQ+@r
zhAep;)%5rXcEuRqzZE`><OR0<6?VoTwmx}Q+<9EYWjvjqIWvT-!q*V(L3ES6jk~yy
zhv-MN=4jwL{c-xIDnBR)gXw+WF^`!nLeziW`;s}y^q#L+6Ge_jF~*@r-$$+f`gpbL
z%(yNxAsnquj@sGi`@0hyHwjZP4KpwcrD!iS{yR*+8=kJOzTkgXwC`l2dY}o-o89a6
z?*B6PAL*Iy|8#Ma?b#!pxtNcIcrt%zF+H-`JF%R;690Dn{zt!m9qkFe$}xQ(ycPPo
zy}$IHkE<7ciGMERPrMb@L~(yr7S@rypDqhG+{2~xI=*5Y`FU+*gLX0cR!pTj^X<>5
zr+3(I$8pu<CUOg6^yvFNW%P1XVke$_zh^hSE5mlcUiyASV*=Uj+E223pZ0H}{%^!*
zvJdd|`@1V#1ON1({~X3q#1Kak%}AsDeeFLwA87y4P0t{^Li?|Me`?<h;OmZI-^g2v
zZybH=u<erhe@*JgkN6**(u~!b|93<h^twlE6C9^k{KPwPifwX?jY8I6aLhk>hVpUJ
z@28QjWlxaj5$!{mUl*eO&+q^BocDqrts9Hh-evGK|I6f`Z(h%4h~_0EQ_6p>@gHMA
z7o~L>S8)wDa1*!jAI?9z<o!G1{X+{{k>6u{kM|E%{PDZexsQkFM~rV?Z47AO|11Bm
zD;vu1VERxrC@)EIiH*t%io^}aNQ_1?#$f^`;c5SqG2<!1(=ZF!myOkumG&HnzBfLb
z9?i#}OQuTr2h;71Ltlv8cZVupd<Ar~Tb_<@uzME!z4zM#!csEoKU+?&#44=D8mz+x
zY{C{qdzqJ!<)~Sp{73a0>T2ale?;_+;v~HR754wHblgtt#$N2lL1e#O7~0n=|AUlK
z^}tjAPdF=_BO5On|3tInT3&bmx4Qpi8f{DL|8Fi<&zJf9tx+G~#~#oB8^V86{~dOY
zqlhDkX57{uq{%+A@9WCv*W5q*`&FrvJH=OTX2X%k5nHd`B~K$-OA$xn0UP3uck6vN
z1R3+&44lPzT*PHu#WmC{;U6#KgC5}@XZV~6zUMwZ=mY-oJ^sTU{w<o(f~Wp%6aN<N
z;yMoTQ~f9M!@HLmd&iUg`C5h)g&WRu6Sr{}>FIprLHZWxT^s!u59$5LEz$RY%<GTm
z+7JAe_Gf|b6p}*`TVE84$l<7d)pbvG-Kh8pTM@}&t{e4Z<pVX<@+VJmvaUn?74gU9
z57Ax&+ipGX$1QiSMoKfk{=U_ET6!_sPkH``_E_mU_o0919%m1D{`)-t9qx~3|8)O1
z&pjoafBueZnOqdcNoN8kVG7c(%D+5Lqj#wTI)!J@XCWt?RfaQUcawR=;;P=Tew>_(
z*r1{?pInG)`HTzK$-`pdrAS`)?1k(9Kf>-mwyNt)`29N6Bn=ZtU;-1E;1pC)g9<9V
zg}3k)r$GZMsGtEA)Sv+sG@u3*Zs8Vg;RXU8<4l-93k|f;z!Y4cv5$TH`yAWH_HjW4
zO;AAt8qk0SRO0~^-hvA%=z<EppLGblY3IFn{`l&7_Bwm7v)A5xt!F*!=Uw$k@_)_0
z=SOSLsy(P~P1d14qu%bCrC5%YScNrMhYe^`=lTCz-=X0Y|G$#|&!2Bb3q6yTzt!<u
zQK+x$!-{K6Z4zJK@8nIg6nX7Ay)Vgw?0L^tdz}c|g*|xppV@okbvqLLi?^(=BkTaW
zpYxuDJ?+0-VT~Q(N03AcSz(#=(nxQL$H<}LJ$Z+$+;9F9{TRJWzvu}vwO#peQ~7W|
z<{{++z5c3k#eO@D{9pACWnb$b&eG50A}-?!uHgo5;gjQM?l|sw+58@B{Ph}p5X-R#
z!XBaDT6JX<)@T0P{$T)p5QbnF`raNC`l@_C>wSxki!mBwk)7xp8=45?=p7?{KlBOo
zN$Bq38$MKqq7(Q0*A(H?P=a#DGssz(gQ}hEGy9f0$Ua?VpB}PLXV@pyqoGmzLA&;Y
z*?eiVyve_w$-hS2U-#+$KU3Lf6!!mljolp1J|nLGKhOUxz#=S0a;^1m7TQ;iUZWlm
z=O!+v=YQ&(WS8r@9{1tbl3VAicfaZT@f|j^_uDmqWH$T%^~OWEraZi&eDn;)`~NZi
zf4cesnMZ8Pui7nrNTCL`(vn6UJ6?}f;#h-q*nlIW*v!d^@KbHuhX)shr}ID6sh{ru
zq8xbAzSB_Y`KSjpk%je%>4o_bo@t!_8P^W}{=to5leo8_6mbuq%8AN$dYtnR=P>S|
z??U~X?oC+SqcP55JoMhD!#^JFA3pr(xo{8%4i5}J>3Uw@l{FmO2Znuz*M=X>TpK=^
zxjyV|TpRWrUT+P@_2CCsH-_D(HijKb)`fqVxlTWt=ezZluuEC{{S|}4`|az(PI~Fo
zP2umb&+i9NzIH=+cmIY^_S~khef5TLK>N#2*p)+mJAx#7%v1gFP5o=19uf9D_d@vL
ziUA=dyzfiH%nf@c{0JXx9uW3U{<BcMb6BX_J}lHEUQ|{O3+c5(LTzS9sBfPT>Jpz1
zhd0Y>uB|v+9FA1$mu(yoes=fO@K0C&G9>9`zvQET?Um5voGfz4;}}k$(Y&yR3ll=K
zKE|f&<3i@{*wB*rY$*HA$Z*o{h55aFxY&jj(|=mnS-jh(@AAtd!$taKT){Qmz%AUt
zJv_i8^jjytqIawL$Yjq?jkzQHj2#?84#Nl(V>HHM93~*{?>C9;)&4&v9&4kW7LRN7
z|ELGVw()p>eXr;{7Cr;{wpZ-|EdO8E{^B^!@h|_)D`6fz>)B`2XBN=ooWmRz(HEn;
z-M4w+aT{dkUEkkpBg1+3Kg(}Ru^h?$BSYoxkzpmh#(cmOR?*j>exv?x{i^A~#i4ex
zx#q%}SHBuIkeiTQ@oLyYmLelOr+>8r<&JkCKSsa3KK;&#`teb9SpPgys6j2#s6#z=
zxn>Xc;Q$Wd2$D#lEp2>2#{cNs?;txJcP%tNV7~vEtAA4*jc7tMS}Kh%s5ZVp-+${~
z{r|67H(%W}j%oC<gt{2_DBqdzCVI_^;*dgN{@?G}&ehVtO#0VLKeEng-d`MYWFA@N
zz%lX!GL6D#D;IFm@oD6zs~0FkI;VOE%8{y>#i2Tmu|WK<+RftL?>^r1U)$9GYeP8e
zy7Rb*%eaDTxPi8``iDA2Zngg*3;R!#dB@#k!$<1>XsY(DRH}b=u?>gSKhV0@_@TF6
z(SPwu$Q^wpbf6Pm$fFw#?|FV`8udzOHV?FA<SWY8SCp@>gx1xsgsNrUA5y46?FH`-
zbsu?uM_&!M+~XbG!vj1*zyDKS!2k@x5JVrpV*bc5j9$5MWJvDq7e>&FF&a<j=ig?3
zA1V{>vcJClaenLh9s5T1zMUxS#j$O@{Ic2D9$|&`2Yyx*rqHLM1T!!TeedggoUQ(k
z)DrexIaa$-{HW^^e@5Nor~Shm=gz|d<U5PPB62a7qHVnA?>*+c*A6na>2;m+{4aU_
zr#$~t>i_4||1ar-BwHLu+ngQg`I9|m`v1TEYUnj~pywOb_ZR0%til?s!v<`^7F>_@
z{@>Y8;cLEI$IW|{Q{)cp!X9MX{YRsDSoA0BFBkS_)Rxh`+r3Hell7O!I;UcO|8R)h
zhh*&cjS`jXwcGF1f2+SY<+v{5xSl<(CDZ)>t?wBdh|>M+Kl|C_oG1MUS$YonDCe@j
z>I8o}|F7x|_IK`M8~>lyA6W5BIOf_DIEm9Zi}SdM%V>L(|Bu{T;z#F3_8s{ZkLwo=
z$F={T(*932Ur;~!Nc{v^=eDBak~P=Fa}76e3(2ee|GU;&qd%GdcaQ!6|7rf8Hui1N
zUsHZO@_WDUsx$uBGmdLi(93_W{Z9R30DTaKU>HWA7=`(NJGK8G)V{k{y<GX6Mx8uc
ze^mbesd3iM8;fz6fJvBwX=vN%`$O)O_W$GRbeGia$UNd$?*?T^BbsK*|9IN|L~o5@
zL+u|Uo*9^hImk}+{Y<ou54~e1`+)`YMd&`Lj*KpJV*A_XFtIg@{k9az&D!(Hm8e`V
zUnS%#ta7{tbr<9-)FO@eL2pAG->}Z_dF=t4$akyM?>nClTj-@I#}4em9_+&b9KsPK
zkwWh`Jv%bm-zGA)1!l<{@;HVQIEmA!dQ1LCYNPu^Ej^7V`9EL#d^qd8^SFqo?UU8o
zCmpxV_x#`V{Fll9=o~Kp<Ej7uzsvt?<$tuI_ZP9hPGA4`wB2{wZ(5vXFKcsX<JW&V
z&VPO(+z@sPcM!+^+#?@E{kzu4B>Sybo+6Gv9zZ@Be>{jj1j7*DVaF2hjO<b$$dhsW
z@dnpdZZ=<f{lIXIKEk=h7>$b6=7aAwAN-*C;AH)4#^(zkhm_-*oyHtowN9VoNtl9Z
z$O_Bc9TZCF9f$ev7x<_2S?I3hzw@)Z8u{;J)lmNT7=9{0xQ48q&i^Ls9M_Wt|M27b
z&m7m!!vd7)i&;drxu<sIrfOe6C%p^#hx+;Nnt!l_|1F*-@x{FcTg0CkrT?AYI$ZyI
z<qKi4c*;L#UA?baKa-vmR+)JrET`A>m_xt+g|L#o3iVgbL%475{B3jS$u+{7uiLAT
z+<@$Ed$f~Vkh$u<q_aahOC6Wv$-KbM{q7T0?xWhh74~14D*f&~y-@nycN`zF!!^6G
z2m5dUhj0W*wB7an$=^A2pc8R#m$=4bH=gdl@SOCcrHB1T7Om`4McUX5@zkLSStQR{
zf1e+bqt`t2`(585{TS-U$Qwvs@@-)268TpdSh|A$HJ|VOA>Z0~nCx(UWzH`%f9Et=
z*ndl#UZ?Ob<S*#!N7Z)gf9=)BhZ=1Iwfg(wxcs^^d_5HW-!D4%r~fUS70-EG#ARGT
z+gjrfkV6ML&nf?ufqAl9nb?rg{vgh#ZQ38uGT;3n_EEN?PdRf<JU4I)W%@twkoWKa
z`Oo(ckH~%-<T3PqMg5-a`JA}{WQ9C6gdBzuC`MA58q2=X^f>o9&Ve3FABXyLo*&ZA
zsYS2vt|w>%_%#0?TOWRGPt^M-IDZnRp#(F~XK#jC<eaFRe-OtU%%d0P|3BY9ETCtF
z#Wtoz^p2bQhOwBw6y2UnUU}Q49iWpe*dP97Vc*p*cAcME*k6aNAO38Z;+o}HiB(vG
zb=ZJSXya$Mv$wf{>~DYecO?5u=FvS_`#T%kh$cRHvvXVciy7nJvS>xcc>M?BDMdMU
zASo>FYqyIY_Z&)L4}Bl%_wxU%`TyDhYRONQ4+s2y2uD!%b>G<E=#QhPQ2uLn_-no^
zdK0q9A&=goqHv7tY3UzMkSB2(XK@}EaT!-|4So8VZjiUoeAC)I<UKsVBgB0VGCkJe
z`M*5>o%#Uq41EB)-;g)ii#P_mldKw}{v)j^@@fCad_Mkc&tKhWt8e&8{bG>ohoJ29
z;xGnq1ikH{{4dU@^GoQ3^{3dD!umtcX@5YIyxx4A{Z}u|ka3TwR<iu3Uk-8I^<r^V
z7}GbJtV|p4XN=xh`Z(l2XO6x$h8o0gHa*feOx6!%Ti-A~z&Vo;>zq@_=EKI%kR`~^
zly~RKm+R$wGN<jWW3~2vzs<rN%)<$JYyr6li?I~TQN>PGFZ@LPob5`>({+u?2knRr
z%8JHI@;BLhQ~tgye?OGJ$yUd0%9z6VB<FT0UkdvVlX=AbhvWX)EB((Ztid`I^6yn+
z4(Lzv?<V>d)DM^T!R~>q9i{&AmiL3g_)Nd=z%J}ToJU;tZEcqyB*H$&y~g(+AP=GZ
z8=m`LPY*}vNu*F#X3ul730d^%8_0kA%b|yz&pGb>TgT)voWMympD|{bJd5+Vi0p3f
zm~FpI@96Sv;0pa3y4Q;XPxg<j>Y5&^%_mL`F9~t3dF{lKke*)>>QFz|ynEMta(vGX
z$G30?_t0iud;6u5kRv;0zwVoUUHiWI_4MvHUf2I-{xX`x-Hevo<}V{_etYXpb;sTI
zG#5{~^7P%G8*@odo++{ZcS-2?y=Z^j8=j#LK>cL*hPXySEd~i|KKObVLJmW=^7Sx+
zEJh~%ddR)?dg!puz-Y%~k#}$1)&l5U?LJq$9>xiqfJvBw-fvoGf$Ry>LkZdU`{`i@
zISX?z4-2pei&5dZJ#9Z&Yo55U<S6(5nt9^%8fi^QXW3^xchnzs|DIR6$NgU~302Zs
zjTCCG3KJfGs}pZM3jP0OA*>SD8mz+xY{C|_ZT9>Vo<BO!i7x8}<R_LG=UAfr_59a+
z{?2PATi*2iXL|nLZ7W%|UH;lDf1#$*H?0ntKIa=hBY&gRf0Sbfx<7j@v5VY;eK>%&
zh4R;%^7U%@YlZx^Sw1K8j=RZ*4F6Z0P22dtXjv<NB8yhU`G1GRa|B7meSoqH`5%*w
z@AEBmsFUZ`x<9|wA<ij|duu#dzoto8od1_4b4bdYdGZ)4<8|VxnXV1r@k!L(R&Joy
zd1+KtOFvTW&O<Fdjk>Ac6}BoTpOn9+U4IsN`~F-cFQe_c@)tREu8!@>^n>!&Ht!nU
z8TGH({I7+4ktNDs@=w=)?0d-`Z{jJ;fB5(Q;TpZ{8|(|&_luHni|irqkiB29Mu9Tq
zUJO%jctB=1_Y056ej9!7>-E!;%`5l;<RBE<A(pwXOYZBO`@G;jmHR{dHVhTUcb|9t
z2zoI_BWYZBC0j6-9`|BT4K&V}J^}R^wuwE7d$re+lY~vdG?d_z{TnkJ=fAB_mn`&u
z>(7}(pN9olgsOJq|BzCDs8M&QeU5EcZ>XDS{NHQOhBEcD#m-xb<yeVTSc7$F`$+#2
zau@i&XVhu#tJ@&IU;PgaOVmF`sehn(y84GWGsq^?KhV709tYytge@pVRvP0R{c?K8
zRb}sf?eFwm=)P(E0=ljnzi`*M24VYf0Ech{Nu<#CgQ8GJ_WV2hOZNUod&z6;Mf7L%
zcjdKL#eI5nek+uJ{Es|6Ia&S3^F2n7Z6Yas1SjYx5!Wt^YZw&T|JNI9Ago#WewI9s
zxUS?y@-i~?!n#1dy(^BdA@A9Aqc9JZtdd8nV?IF*YBRon`L(b<R$BYlZ=Me~Tz3n1
za1Rgg2>mvB{`%h9k<;(ifv5F*dhCm9I4A#KlK;u(x7g>6ANvRN)@8=ci(>!=q2jK(
zG+8-O{~LzUN1zyS3_{IN_ovR0Zv5E(lTXH773QNmul%?C=5Hz^UVJ``bG*{GJ%OBr
zDVT;5%)l)CSLNRv=gh+bEW%<e#d54f+j{wbv*%29C@VVUu{Z}Y|B?KERQ}KWN%^;s
zZT3E%jK7NeER<QBAg<-VN_=HMem<-r*I@%TVGBx8j@Y)j6@8yEXPMj~Y!~(*|MTa=
z_7?kvy=cEPb>H4FEc9rHI^eh7Z<ya~f5AiaBPb7_4)1DXXg)YFjCGtu3U$cx|1z0@
zp^4scpC5SJ9&hv<x`*?v^(%KdrxST$6&I}GN1i}(_rP$HJdH}%rNmRyWsf(<=TXO}
zt>@d;IxmeXdi65?FKSSWr~BK{FS_P3uHYJO;1=%S9@^$we_+0G{^&sGTH_0lM>iUd
z{>l1#jx)%j70uUQ2oJ=Q-<k-I$bHE6I7fKD|6BU|^Wiao{tUY5d30&F=tN=v-8a0C
zZ`sE{Uu5}jjj#V#eLv0}f?+88jy_tl7(Ktz-%s{_)pI8+-g+U7BPU=IrXaaY`-AXl
z^qTFS4NB-UP*0C*GsH0%wU{NWd7?glavrk2xdr4RWYi7goY=Sr%woq&k*9a3eW#7S
zQ+)r@SnI(L49oqt5^?_gDsm0t`1r#7dG=(T<2Z&qj{S*afNGT$Y5e2i=fk0m1H!?#
z280jKd^-H3aY{I_eoEMX=8NISJ(I$ZR!<81R!q{@_j>qXSxNZe+LEwm^8X0C*G>yR
z*f=e0-K#u2s60HZ|KES~{gwLicV7rw=)JGl_s+8{rT6^G_xm@>OZn-D=Y9Cnpzzan
zZ8_E22>f35^P-SFO#iL%`2SI)T(^dY@UFGWBxOxEKP6A!B`mA#ifsisWl`xR^A8jM
z!`uiy>(7<>gI^Et4u9SJ!~YSsFPRp0s5kw?nEw{`xNaYIwNDP;ANt?J`;-4$*tt#k
z;r||fc;UH_|HbfdAfD4W(At_Wgbz0US=hVm&%%NJFNB|{k00{e5hS5`B7CTxl%n^1
zReKT{=k_;|S>%w%F*LsM7y7XNA~aY3&qPz=*NIHxH;I<P|1;6|{-AKe@72fkp(S3?
z#;X5xckFxl^H6){&$Y4sMX2ll=i%^eV-%cM@zJy4$WiSQiI>CARy-U2>9GFO`+sTt
zneq8cwC^wE->v4~ZPc!hI(q#wWBpuv8fS4H1%JEI`@%)XZSVCD?Z_Rp|C@e-&USOd
zkmtj8qv2}*(0J4QJF@wq{om*rzGe2D{olIm|3<$oo+~Ib$M71Nys!RoyMMSruW^5=
zQPx$8>;GyCuOIH-gr!kCv46PboaSYGaPl6qqpXQbK0<~K&1pO6*!gVe_qP0tJiU9Z
zb_(qXUG%(r?Z%V)&e1#2hITyJr><qQ`(7^{8>IscL#0bt-4f3!KDU|fiJqz)5NgP(
zt^uJsJs=G5pMx+2!!QEH7>%(g?608SlNzu7Gh6*<uKJJnQ`gA<$2jLqz$8q;G?ZWl
zW}$7N`5DGY<j9V->Llt8UANgEbfY27|KF+pp`O!x(D+As#(7zFhgS87iprut8vi&)
zoXM+2q4Ic9m`AUfElqD}SE4UQeMXvvrBSQjsQjD8SA5z&b$)B!uI-<Udx*!q!&lN*
zVGaJj>Ibz8l_N{kH<l?^P{qDi3oDGro3H*+YaLzJZ@?zR_3yWkr6@<Id>r$5%-eC#
zuRm=MY}u{8fhXgQ8t9E^LNj)VV;A;d9}Xa{8+eH9WfSt>dL<m8C(-wB>SAOanvg}s
zd&-dUkLz@?Zy+Zuxzqdw<mom2r3J_6Cs9wY!;|*E^3QmFzZe-#`|T{w<03Aj&%EC&
zWRLX*uaUjqeKy=6Z{ZH^p;_7TfP930n|<TRHmYN^>vN*VF)6VRdJugGp058hlMO({
zeEY?_c9`Erpcu)u`oHz_kEYk`Ee@&oio;m?IMh$pw~O@l;!umNt3R&KN_*e$hldHy
znS?2rhVq}wx87k1eFkQs$C?gv$lhP-KThlaKIK0y=>I+=&fCS>){8^k^x{xI+1m2I
z31Oad7hn+<V=0znCECvD|G%XF|EB)`tNQ<Y^#3D|ZZw#$-#A}iKAO>TTmL_@XhoI&
z{OX<lUs`J7ng(Rrab3Ir&lHDM{$mZ+VFNZ{3rbOrHrKSL_48L2>)-TD)$O{-!ul@i
zgbio>Kbp{t7I9{}j2j?ZUDrIYn2qrbVh{EqD=dQp^p2t8#v%F<bbBxPwZ+<4i$f<$
zJ^zZ?-orxgK^uIMp8rsPqH`<hsrk}{ci&bw`I>n}lcmeEt&M9Kpi17WX3tYA)UP+Q
z>u<&SwQm&lDAU$iC$1)BkwYHGZ~}jt-#X99kJ{t%WcMlc|1;{pXhJjowEeyLCi^6w
z)5vcv3TMgeME-wIzi^)3F_iz0i}cHgZFYHd$wQsEBCLYHevQ0=<a%oXk#|s8`M=7M
zdyXIANxwqv8`6oY3(`Nz^Fj@3kwzWr@yIp(w#ZkA^`8OcAPm6>wCz;?mj`o*^{mb=
z?_a&#GYZ{g!&UzOP5%FV{{L<M{{?vyS^2XyqwW7?>r9BJ_j9j?(d4$u;KoB3OD`o0
z{U7%F8RvKcCSeNtzF8cmktLXc?Z5q9Viq|Ey??7ufQ;qX0<uq=!6I@o7Rl>nzc*iz
zz8rD=n_=Witil>Z8=4Wmj^0ttwjAao(>I};4auXclC5D|dRm?hTZH$1j~yYmO7}Lt
zZRt?=?zoWuzoDF^@4_DJL*Mg7;Q(17{zK#uB$0~J|5XaFqsKJ~Q|_mUo<;pg;Yg3s
z{xIIX35(+n^W?F3&gN&s3GyT|`Wtfm|2XdNwBxggdk1y1wS{qj+8C;)Yk%M7KiJ2b
zz2a;aH=d0DJMWr{xQr{fhQj*S>OWiCtv%xzw0F6W4Ey`P%m0_yU(c$^dCk(*f;cZA
zt4*QR|5XgMrkgl#;SS=uQ%}bK#e7_|?6Ln}{%Pf?_JX)3Vcf$Y&3?zVkRQ0F*?07a
z>}MKM^edks2Oy5~$*uKWZdCp|9)kQt^@8cX)A_zr{HOkJWkGtk{EvDJbIk}8V>HHM
z9425A{&@Xsb?x=?rn+t1Bdl8+;D5FL^%QYTLkTJ#nhQf#UiS<}+G~YA2lJ4cs(ex|
zJXtrmZln8?*BAJ05f-D&*nvX(({sMRsoG+NEypUXLH<|Ihjruzlv}Is-RJc`9+m%3
zDJPDLUs+K5k@UNd!uTT3X_Irepfp~C9oU6E$baGa&~{w@=VLt`|9#tYMPdB+e&xS!
zqH(VLPd2j+Eo8=VT%)h`q5N-t|2}c`vh@eZLpXvYQYfp^_f9q;i+lrLmyC6TJb4UB
zWz-4sBzpfkgwtdk|8SP<({_B3eD^bPf4~>Q*6H%+D9>k%@e9J6PYnz!$@92^Ysfz2
z{~sL~ZqPgQ8Rrs%!Y%q8bUV&F?jk$21KbnVtIgs8+4FmA*^~XYDlahrgD?ccFapKs
z`{wguG&vR(a|ecT<OEE@6eKt6e}2P05A?$Rd)xTG^ckpskN@j9O~$qQqWx>0tl#uw
z`<K<{U-s|X9G#mn1|WAvy8sI1=)cEuS-Zdr?F4-Pu8r3J)1O~`Qy;&+{2J8qrPHWG
zJ?4mK9u{B`7Go)vV<p=7q3u1M5jxO$)w8+o8R^^a=ASprH~s)kh~orW9`ae8lf7yD
z0a>AqVwHH-U>!Cf*=YVh-*ywdMjR<@p_ih5lzW@%9;Zvcc7SqWaovX<<Su0ITceWP
zhs;smWwrdTd^q6v5DM!)?3Mr1zSBzi|El#FwDH6`YMiH6XkVM|`=2a-E#YgsCW#d4
z@X7YwCdX~#t$**_oV?kA&U40>T+#+lcFR`{ud)ArZxUxS+2Z`nQ2xP4>)(?(ag=@N
ze6nJjdNNu0p8Vga&7NNVnIi2eeB~bZ_t5=al|OH~KQhiEjQxCh{g>O{f0&5<ekWbi
zOr9psBCjke^grs0h~-k8iyY@9pZD8ET*eiY6<No8jCusUl1#eK8}wVagSdC;)Bcf4
z|Bw2;)<E;y13W^%ZN7W->c<*D_S7nO$v*aP2ssP||M{x?cT@juJm;I*IKE;W(3iB^
zeQ!`G7B(9B57>P2$@u>X^hub4zTX@BZ~Xr>db2XTguJKxpFz$-cAzrmIqMbBJ9_ve
z!_^1q3($Q=9=NWJ9I>rD>%PYdUxdY2ise{|Rj6XGs@ba)n^?mit?glZZn8mF`KQPE
z_iSI|OzrFZ)@HW0g^YGL+Tqrb#{aXu?d)xi?3m8(YJ2Y*%3cp<hfy)uKC}L39X4PS
zl5^Sr*~SIXYYy{`-?Yvzy&UzsKkomp<}2$r_+<b84!`fh9>o14^4Lc|fI|Pn`{q>9
zk06N@st$gfzDntnmRe~o%x^d+eW#?;IZenShdhqq1Wuw&ncg1r89EU6RPB-<pZeeN
z`Rx4I|6ZZ|-|XFxag0VQ**snwzIe{!JT4+TUmanL^#|x3o<okEze2x;?!)44_aBYX
zhO0XWt57GnMczSjsPzZP2PmvRFkQL^8YkwsU#UFI=G0$R&vjn<uKPacy<GC1j(ShX
zPb>=E=)CPcp(*XXW{l$?8!NrngWl_1@0p(2>)pJi-2zY6F{-22qZVmwZMRkdU$|6z
zT<ps!|Lfl;VtYaUi}Hy3sanDQu4jKY%U@*eYWX{(eZNaT9XZH3Lof^@P>j(Si*ab1
z|8f7LzJv~R4wlbHihrnl|D5^<ALNt!{S@{CTKm8HA18=o5@MM#h0NUiRG3DVAUj=s
zgq($r;mWFnvh6u#9NFEa%p&LbZ63;g@i3A9`R@}8=!>uzeeBRuGWP#3Cwr}5v675=
ze;c+U+W#Ky@`W<s1APE~UxRhngz~MP*RLNYw$MxCF?OI&-LOx;QpIF#nvVBi9}XaS
zS3BPr^PcE6XFOYVnj<kBvCdTIIDPbSU1x)^6zb4~zMnlzWXT-z=>6Wq#4)nxE5A>i
zAWz~n&f+|puRcnAvi;+tu&l7m-A9Sb^p0hJNaQX&N?f6r{fBq*)*ljiVO{J0km%g_
zheXxGM~Uh&j}oczj}kRx?es^9G+F1kp3Hw<o1kk8<L|Wv+@RmW9o$3Pwm&4=k=y@=
zL<c(G6L-dcH2xvco&H0jVacOJ<JCVTnr{9f(cJTgL<>Db&tCXLq7@aDe@HwK&m;8v
zJLNZ$!YaivfL=4oy@_`aeF*Abb8o`ZlilmoM~Pv=ny)`fj3A4V-TNppnjDLa@Lc1g
zL`VCh#5l(jP}mnn8PK`=QKAb~Bc)$jQ_@-^?X_fjzW+cyCb?z`rlABgFbi`q4{aBe
zKgiuw{`80!UC5&w4HKo`eKsW?C7RJ9&u5eYS+ezd9RK$)u|Pceujywcmm+!QVWLv_
za(d0fhl$kPhl!Q+Rj5}FuVd%a=N=|%vGu<EjncUxZ2#_O!W!q4*;BOaUyH&zdiht_
z@2?w6L*IlgD2?IBYpbskS2f}oxkCMZukn8e<$tnXxv|4}yRZlQP-d*`0rC)zpiTRK
zJK`L`4s>eM??Rs5jRyI;Q5n#5Tzg+y|6{xMKxFBymG%)4M+$XlLKZpn{aPGkkN&e`
zWbe1tEy%6v&D-?h#xXIc9XH>%-!WPKE&V=!qc5GFU80VIi}a4!$_>8CW%?C#D^v35
z@*F$ot8WPF`LgfaTA#hYwPuLpTeyRJcz{RfSLWOFjXpyTz##Pfhkkr=7%G%$Bgo2g
z_FE%IBS{}ijzf+7p1SS5HOluF)Lk;(BYOT_z7~J)Ul)bGzb~@iit+r>f8r-ha9td?
zGKrjmX(&O@*PQzw);#@g5u0xx5@CgT+`m%qrpLMU8UE=UdI$TSd(Ip``T}&bt9f)K
zv>yn6I=^r)JH-aqvX|-O>@Mo>+V|cyi?I~Tu@bAW2J6tap8vhse~=xg)GyAlTkKh$
z>~`G1_BM*U=^+1`EUbUWZblp1ii(lX>d*O2VuQFgVGEKojsKr)P6NGWJ{vdt*-%d3
zfx`I1y=>bXY@F~zGw&z<vGRW6!`%-P2j|{T99Z};@slMF6F=^MJF#C`u&?q?;z!GF
zCq7txH?eo}-Nc@r>xmz3znA#I!rvrzFS(v*Y#bRHDo2Lq_lAe2H_WX`d^WV)9UgZ1
zuRYj@(rS53-&P!Jc0|8JMgM*w?n!sJ%N)ex>h{j5;Kv<d*DI~L__Ip`!#|yQE+q9!
zRVQ9D$NAHt=J-n?b!w=(&EQb0uc$upN~l{u*f@l~#NWNvmpCMjBPj2=n|OEdy+qm6
z8;R{ZZzOhXyqEZgp4*AI{#?rMyY4?sd|#RJ{`5PEoy+bd_RM~i_~F4vZ2I$|{M*)@
zYJ8CR(f$76gEt=}_Ts>@XTnd8dY`UsLKZoE80$!6RXVN#sE^^4{-;a&6v*^LeGGT?
ztBka6pmR>(Bu?Wj&f_93qiwx@n$7y3p3`rGPS<oL^gp3{x_+9q`f1RFX0*)LkAo~)
z=L(xy6t0Np8gAegvYU%SM))0iN2W;IPEoi=e}L}2McN;W_)h9bcqA<DUDt2Byouyg
zZG_|?R0>bsE($dm;yCVQSa+r<)T6eiD5TM2&R^N{`u{)Yo7H|X!g+oCnZo%0&kqd6
zj(gP~Mw30iH>Ou!9UIU2XMIED1WZEQdvFSwA*Ycg$YzYQBxj*R|7UKtvXSh3Q`t!7
z9aqe>R;M%!bIu&hLvph<JcTcy*SzPwZ+kv0qAy0h<2uJ_vUa!jmZ2|%Z6jX@rK9wv
z^G%mJ=gIi`<@A+Uh1^m8F*?zOJi1Yp*8k5JPVt3n_{Ozlnx7nf+<L5W?K*70COn;g
z(Pd2>$8D$d|DWUMqk}Ks$>;CdtN(vDKOYUV`QK=IlmCqtdInkNwcdRpRIGOI;)!eH
z?I3p{sqei~_#S%APHh7y)IYwh?o6*k8nxKxoM!ibfINga{{IM>M5af(Kt{T|Vjn&m
zgFM?G=jA;e|3CQS`46L{UwUiDOaDUYUn2e549cpkTkX0ma`Ac`!wH;38@u1mpU<JN
zzGH^{UoUUP{~;SDOTV^)rst#|Eo^_>8zPI=bL@Zhz;IeTXK@}EQ5gSUIWSzN*Tl8~
z;{&eHuc3YvyTNun9sfU&J=E44?coi--@+Zlz2nQ@mREjL6dpK!gns4n2nJvfhM+Jn
z!#x((p&aAANPp`=`y`SX$GI-=)pKsE^q%SY@!soD@7X)<8uLOJCXNv(#%PSiI84AK
zq$)jE@1W|q=X}X?J?Oc5zO};R7>YuF<50E{HFMd<H>C}A^!l0lKdehUMLg3`g0i-v
zFoT?hIcR&R{o@VKA06n#ACJG*{=sKxI;H*Nxb}}r>IP)iaVy#Lul)ZPjs3TVU(fT_
z`4{H`EW%<e#d737TNGB2s=%S|ukcHG!*S^f?O*J9q5W$$fBvAc8p1aqvsc~YJ?%0m
zuTl^DmG`lP?;>4s@3uIGvO;-z&%0Ua{7u+`<eU26m5-(L8f9b(<@6n>-==KduZ-TU
ztR{B}i+u=t$bHDt50Hlt`w(($dR&+Hh~p&kj=LRqX4C=5s;m6(o7xx1nuqK^`Lumu
ziE!7{p$S>!kjF8cz|-}A9`a-Fvi~>PfA%{+m;EOj#<2h6*?+QmI{Qy%9B0YG`ai4x
zc>SM~;!Fyw#A$lXb@spK*>IMg*QQ$UxXy8!tmPk{7ar$57UmZim!I|iFABSijJBEF
zT;IhQ<L?|_L!RC}-gh$5cY=nr_Tl5&hmUFtKnp!{N<6paPqZcEZ{*N{Ki$9WhB$BG
z4(_4cyL>=CLce#}*SVgr_LG#f*G%?&<^S|m{adJy`Tt!1@Qm{YU=W5N*`NO?KMbSS
zER#2Uza!|ysDDe|5cXvKpV7j`VjL!55_+sxF@-FQchnbD=no?Me)mk6CcFeQkT-{`
z>=%4!dUFr`=juQF`FV~PAj?iIA{XPyc%80{b}RMzZZht1u+(oo(Z42p?ekau9sTL-
z=Ss(`um<a}0h_P|r6@<=_w@hC1J&|EN}6I?YtKj<>JZ02?{MBO?7==9z#$w#5^ZmJ
zKgd1j{ov{RKYC&PpUlVO|H!yjV9Q4D7g>7iGVTAr;QRkf-M>vcz}wF}ZqJ(hxc;3I
ze;t~T|CKfTqeR~~J<DGkA4-pXpC`zZIE}N2W1!>SC+F!;<}Z|g)pOC`Uhn<KHX!a@
zs2wH!+x36%)=!L@w0_}?e&KWazt89=KCYiy92apJS8xqCa0_?PwoLy!a_B%Ox{yaV
z8oKm<`;Vrb`nt)MgZj3~tm9U)d3bTSC!Pm*gnobTy)Sgm^y2Ugz2go2-y4g=0Qw+w
zJI=qU|9igvZ*qvRiY3Ki7&!vT!TM9l(THOhQsSxUZydqT)OWN$)s2)c$F*d7sQzzh
zu9oH$YCaNvK{)E@^+${UX#JnD;w-ne#k=3NwhDa${>$}$X8!;5|J;}7-QOhfOu;mi
zAS+!N>6=0Cc+a=B%{~D1Ip}trcO2*Vb?(-#D{KKO*4xK{tQ=k(7L!X6pT#WCbUD4|
zs6OtGymvD0F%ah#$2rC;{k95gunrqg_OiA8$t@^FId)(d_TT^xA^)AnK1pHz%SLlu
zLXq)z?96wqG4J=j=U)v;vZvPbC3}DCxspxDB8NPV;RH^i@9X9Zl4o%qy+7kelX3l@
z%j6YY!wuX*+*9Zdc@Gcp2w7p7h1Lap&+{AgxZa!psd~BN?$>+=!n!7_2T#@iH&y;e
zYN!0aUH)f3(_|f@k6c*)>8>$|807kLW4hlh>K}&ChoQ~;ZCA$U$c`o6$?T8qzdYWJ
z!u<bA_CGEEA7=mQh55Pur**S&q2egUXpF@;Ou!`c{k{3$>Q6miFAh^2_kP`85@ZQx
zU=}K7DnH42Sb#-H3aeDEEvDDV<Eh#9RiiIQ{cF-DEInCTrMs}dF58ge3)ie>AG9?W
z_SdD?uP|@TH7l_SYw*ebpLLGguJXUHv-9`ad34g_*!Va`pqszlFp~XsPSZ{Hmuz{+
z{^~=>q7}^-ock}@hs3oBTaa~L-2bkW-Z4tt&MA!l`<`-tjPs}ZkBR<^EO$<Y|Jp(B
zLL57=hunw4{&#+hW9Sb!K7_i>(zQ%F$=HWpb<_L<^^}zOYEZjdIO<T3Bd$pz6|Y4T
zvdAHiw(Zit(>;=Lufk4x+#4vbep7h=x1}FVXg)6eN2Lu}w4%9PIWC?PIEmB94tCD)
zXTn)}2b-P4dHO}fHU0DGa!%(M{UmSbClG!G$=S~o>f&4F#fl!!Z>arrgvGSRJrC<p
z_C<5i<dOKUQ^^0)`uDDl>qy)pGZ)N%CGR22{*`Yvp8iYP){d)7;>o&N-9zOgJaFD4
z^nF9zUy<i_sN-V*24M(>VFZd1+q*}Ty`S~p-{C|0Hq)qMgX?{>RU_rc{=Qk%2rH|4
zK8$t#I7~pkb6}W6PQf&E&Xt$t^*rKyqqghX1Cd+eo`to1q%1)8g7!W%3}=tgL~llk
zIC}rX8sB7GKX4Wq+rG9T_La|ZJP!-77)#OTnJp(*qT<r0!YZ=-*Kyu~{m1Fa{l@?8
zHg1t#!yiv!1AP->pK45N9P?O<Ey7Asjvd&AJ=llB{$JKW=y~7w>p1TJbBH{GBvNQr
zhpQu-kVP&^b|JH$ouEIN|96aj5>M9;J*ccgReydFzc4kDpEQ(T#OF+-j*ncAMXo75
z#mDC>Z{;(eaC{o&<~F|D5TCdHZ{!~GbMNx+`NCZb`Dyd{X><ACeA-4l^*@L6>yRDD
z&%2=in-6wYeCKfymytZ9|J!d@=&?;8g=_R1sK4aikVagq@0PG;_jQN7hwL`v`N>Df
z?AI>9ckEbUEdM|FF7dSge>4Aaz5aiGX7yzMF;V(ayHMOS{RiS&`vY7v2tzOoBT$Ud
z7>h#s@BT^pcWd83HyWg~@v8KDE^)1d|J48A{>S?tj1y<_xb#cg1bU5jfw=zvBzm5I
z@wERx-s@C-z{2C0pA7pmg&yaam5}A%^M1?$>7MF4n8+rL@xIYPk86@%lV@i*Zx-gD
ztVmnXd)EA*KUx270loWkQ^O)MwOxL^>6^Xp+r^Xh?|Q#suF22&u58s}=Pku@ti&p;
z!8&ZfCiHzS&UrVUf!_01o_$81f6E+^ggpP8xXC!ir+%XUpvQHAN}XGd9oU6E*oOmX
zzv>%A2Xg2_C%TbGBU{&?d}$sd&m;318!Rl2J;<Uk{zo~Nx+{GHW51|6Qh#|0asJ;S
z|8WE<)S(Gk<d8>U{h#Tc=}ph}s%QMrvnKP7yUB*}>VK2@cg|@hTO5CK{hzJE<Cwp=
z|7oAS2Fe@EU-&oep5pG)Hc;6APusu=$8kQyNizC7r{l5qg1A4^S^9Zg#ARfU`;SX}
zPI^b9d#m=o={HbV|HpCHVfRWVh227GCi}-e)RO7BYz0}*Zrt(PJv>0g8EYYsac#OJ
zo7wMuc?zGLA3VVEAPhn5e;-DUK>1g#BdpvkrjN#0j6+p5`-_ym{2IhL{b|%C^doD7
zpWvKHn1X32!3@m89JGy({no}f7|+mw!u&tuEAr?*X#A!2`^F{uv7OgEM!)rR{nlg_
ztwV)L%RKQcz#=S0Y{Q8A|1G6=>}P-XT7Qtf65Z@-{yzJAoBbtM39DFd3<J3iNj7x@
zxe0~+|IT=>myBg_T#C9b&$ZHXCexYN{$c(9yXxk?n;Np#ca!#=)cG#z$*sm{Y&+%q
zIIe#EP2bh`v{Q=b-CEy@zNt3jIodBszi}RMtVic=V>_ffzgPO-lm5BBzlFZPCB8rM
z@%&TmM7OQ~kLLCL!Y=Xc!9E;7HY1;`FfW(hvHNiw(h+(R-IL`j<wVyuZAfHFSlolM
zjx00oC3#yuqi0bmpQW^O)tnQD<2>pd*XygQCDX_Kw$r|Us773iw7$z2Wu&*;2QdDo
zy{iVtTzdj1aT;fF9vz#d9Z&WF?7rj~A<n_;M030L)5Z`Q4|^7AX<j1j$ap?2xG0Xx
zxPoigHksWYs4bITO3&9$54Y%du>D)p!vpdW%D?pG@b1r*E9(3Gc80D0;vSzN2Oujf
zgF*C;*W|x3`swJy&^^-qp=*@;^IPv1r)c-%`!^^Tzigjp=M-Z!#-i-==F<PvJazg6
z^u7Jx!z6M_)L-TgkR>SpR*C%edYD0<g*oW^z}R^*mKO`iI5uz*S$h3-Yc9PWw%&d{
zRG2IN{lsU(J!x9(x1}iTKQDYay~h0U)QXa@lD-P{OG?yVN<!LPu3EnxTK1*zkL$k_
zK0Nxxa4<799H^WeezM-&uZ>>_``1nk`$mlqKiWPqe6afSVeg91n=kvh@WaaS;Ro|S
z8+Owh2fh><`o9#K%}Z)p_>#7fmqSbD<*>$ot-}Uv!WNX`$eAx2r<DjlT|ZU-*O$ZL
zkK_a4u@B~mJYPAo$oh~);h*k*IwbXlRVSVgRjc)Xozjni^qChQ_kW!m9O^E-Xx)Y}
z;qS(e2|L8G3*|>YAKo22K9r4oEo>k9TG;X0`0x+$zWuf@o-^_b;rr71{)I1uog*iP
zJ<BGCAD)^V4*2a*JZIU|@T2`-3LhN)QrNrM^H?@L{G`8i9{iR>3U&BUS=B^lkwYHG
zQ0_gPAWz~ns&<!XpYaZ?AyCufnInDOJ2+FKeaig)`R4bd3C(EPSmGI$gzReb`@Ofe
zQPu}QZmRVG(D|D60ft&1V6gQ829|`a>Z;pN+TS{Z{x7euzNcY&xJd8)iggRVr0<%3
z1%1DJJzOJip!~=DVDq_~rLFAKMd6mPJGh7JbJLAKlUL{+ckOMDNA!NXeE*m1ZI7<2
z_PWP2!Umw?l<;2^h03GT!yv~)kfaYIN1$f<bZt4)Lwfx5P>cGZ*2+NkE&bhoAB_xs
zEZIEUx`5;a#Jz;N(Roze0J;0>3Yg@#DX1E6{UPHG3hNK0l}j1p54yAyk@Xj}Zzv0%
z^lwaaehFsaU2DaZe^?aSlzr{UP1eq?O#7qpj~}UfxTi)my`ii?%T#46vV+w@Fh^Xy
zUn~jp$e!Aguz>7)Te~N@7)ufNA74)Pye%J+eZMXVtH?E2hYhIU>ue$`*V_+;EJc!D
zPVPX>Rb?^Km(;KDr2oH)pIOb;#WD6Z)7hml?9vkUe<3@i9x%(byRZlQZ~%vJ1W6S9
z|7~m}3jXhVY$NiA`M+ooXX8NrZ-2h-P5$p)wzcPD|NoM(YJTeY$Ip1{Naejav%=#T
z`8s;XN8)B@o9Jc67Zm(|HoI#o`^&b*y~8WSdp>@5&PncfKAA`5b?NGouASNmzN$X_
zZMKE2F8Kd@rE@d;>%Ob~o^oAb{69U~<-+{CG~Zl2CvXy{aTe!s5tq@%mbWic#-jtB
z=)#kAz8Wq_KboX7rnSZWWIUHFS`P{vY;2!+uHgo5Av@VQp4T0E$4KeHJ^BN5uaM@A
z(!W{y@km(3LUpn4tBWD7ojiaXgxEF^$4=H@h~r_Xn=bvMyyvOXi7K|a+V_xp&H7uY
zo$C8X9lahSTvLqE7>5a%gnXO+X0q)je|x)pwN1W4+!L>hzngFLuF&wNIyjn~*Q`F?
z@{m7HX3=_1{$iK6A<qBJYipP$&Nv66gq(rGxRT!)S3;kId02o&DC2*|euKsIrC5%Y
z$lmuKxB2Dt4*q!#Yv}9Hz0AFBbdTyFajeh=VVkf872~aULRP9%C*|jI`VQ<uN?xtO
zpRSM4`wMH7zi9nFYxTr=dRx~@4`O+`$93gD*Dmn&f#CrC5c=G6UoaP!o<s_DXhIe_
z<Z%prKa1->4z%{U^&cGfzGxj;@-)ukJeu$7Zz3<_3a%lm&KK(+H|QPneVg=K^gDR!
z55D1B75>!!o&K@^yN&<5lmEM&|C<>U_PORB9^er^+5Xq>2j2fd{(qYPTgm_D?|1V3
zyO!|(XY>Ex;QycE|DNOj@@EVC--$ER#s4Q;T~|TIJp7Ef24E191Npzt4GKf(HG}!T
z^zu0W#TcymjqYu&dra_ug%5Mi2o$3^^FkO+X52&8J&bj~V;zq}&RG8rbn4rGdVlV*
z>McGfQsS&ZoKuiS9qKW`wUaOf(@=sLn1wlL^PJjuN<Z1*S#;97q%SXBkK6yG|F-m_
z`8faosPrR?R#e>A=PRBCScJt$4wnAm#!}F0Cc8H*r>{hPfA@y8b85#7`Xl?a%I|Bi
z4rS_md6a*}xHM~`ZxXfzr6|V^>_XK}`G2qcj~e!?R(TL@L>wbp|Il8&&e?|pIE3!-
zQaC~;kwV*M`TwXqh7J_$>__sL{MU_!?d;!7`G2mwO13PKH_5EyRx+;tQzxD#WRZ(<
z2^)z#J(ejscJmnh1Pc3l?w5ae`!DjOu(<xuY4R+R+78Z>7ZLlw3hju;KNT)JzJfYs
zLA~QzGOjatLs;)G`Bp{x;<f*6{fIyQ@Ns*8{zdc4{ht3^QMg0i!vn-MI3JPyc6*k;
zQV+6c!87y$=rsps5ZUvq7s3#-?<Hdt$Pp;UXpF@;Ou!^e!8DX$24-OnV*B6rq9MlU
z`9>VaH3ACj$FP&Jd|4o@d7^fHvh0`U^pVSvea-XZd#$9$HA&(c5Uc2GP+0#<8oQ3l
z|76v5<FAmqZ~PT%>1oul4fVJtor_$j4=BX_)7H^9U=y~WjSX(Uz()13|LEM!HXUT6
z*w?t%Ps42X4^2zhI(DjMF57mS{X;7%)*5FZo^tHKE+pSF{$N@Eu!kOPcMALH2T<6T
zNWAH7Y#4b+*by}AlTVTvGDX%Qd-d6{!TmSUJG#t4IBaZyv3s4`7rMzjsuJw~82>Sf
zokT6t<Jo^<ah>C=Yd_h(k#n5Ku_(X5|Mra9QP>~xBkA`nx{#-Lqd{Da6QzH$^gk#4
z!=)ctdMo1m$1>|smz)3j?q4b2#hXk&p8t57UUS#IUDp>tKacuL?hR?gxfmCPH4oJX
zKwd$1x_3t2K*oROE~uB_mg75kI{s+0Z(zOmf86{3$os$G{cGQ;^$ny^=NpQBMfY6u
z0FTgbPuTi?^mEAp7=*T)>c8@MPC3%CP5qbNMbCE`XQFIqkT)CA<h*9}&z6V2O)`sC
zWldardWd+2VFZei-01!LE=JR94$8wAOCN`)<BxpzvHz-cuyLxwx2|A!pR?vSJ^y24
z;K&k`ebyK_^4)K#3%~zLm_?t1d02o&Sd7>=R~UbAMqioZ<;c3fmE<aP9F@kq(l%JW
zxgc$1et06R@!L9Vz$R=#Daz5OPk#s5!%x^n_FBhp54jHqa0rF=9c?@@KOb?N|Hdn!
z>>Jkh*LK@yOmL4jo}S+uAMj=C`M5sioH~^MSbzTSUI`U%dDi+_n}lVNL!3Jq%f39l
z<~h%Ay7H5L0`<OyxOZUOV?M5dR<(O%sNQdl|HC6g4QkJf^lgm{b>~Kg`j1A2DKS2r
z#A%$xd0fP0v~4v19l2BHzaQ5pe@WjwnMe2C;?U4OGBoZSX$+e&u5<POFVwfaME^h8
zI=eWuoicvlobdyfj2|F7ZyP`G(D;Gt#t)#O()#m_(ud}q*6-hI{{CI__np&9R_X7q
zMryqNdwuD(WAxwC>&SX>UU84ta09n+2Pb^X_s9omzWQo-ME3ik`ZER~d((el5WQov
zw2d;(fIbY}Yo$%UdYAv~^dF^f>1#XuYS{Xw?{B{O_tme45zZ?{B|WvmeYvJEkLx#{
zy>!QQ4QiLUPtT!hqWeec4d<aSFW+;ib4@))i)SpxVFD&$3Z|h1ZLjHnMsBKnKH0Z9
zUcNq~E{!<X>*@Oc-gh%vl=&HChZ}!_X76~0cxGV^<{>LAg9Y@CfxdezqAx}_J&!JQ
zVyUnS?{_)55=r_hat$i!vG2JC>l|-D9lah;`uz(2^<dvWHu=5pm-@uWQk4Bbo7m_5
z-{-VR{F`=9VdXEfxBqI+3w;lomzm>59>5_SK~}v!t}&RTcc^c~zQ7c{4&6PTjr`Vi
z`*B-wg?!%>uR#uZB-`bmMtO$*r2IKSKZ&>|KwKLzJyHH4PYXMXzQ27noF^~hGOpkn
zo~Cc8Hbuuhzj`IyB76V-)o_O_?78~BHvg|`PxyRMcp&T%`hI4O|KAmdo`Ao$Kfveh
z6VOl``hDO##Q+S#5DY_ESy71l3yz>?v@d3dCPFcNG{)lT__4dv{?IcZyKic9WoxRt
zlq(tb?;|#a?MkDLUVqdWW7kf=Buv3HlwbyC;pzUn1C77aX5Wb}<k5|W_t-x)t=87Q
zLi_t>?eAn3asS<NX|0%Qu7J4i(UVKG$I}-e?g5mV?Y^+c@nXcif9fms6Oic)e{1#Q
zazC!Yxzz8=5!awzNv=ZqSJYd+8NwR+I&8ouY(XiW?*G&7e^6U3{iyQ{>QU~T9oU6E
z*oOl+gd=G4%-d0zXHRxscwD#2t6z2V&7SU$^d9?<mih7@vS>x@e@Kcag*r6hN&iEZ
z-m#etUk`d7-Fw*`ba_slI3_IiKb#;>BKAL=CeNbK{~-Rju5ekc>x^G~vTtndaOp(V
zCH4QCYy^A!pXS$2|5Ugrp3As`{NFwwu8}uz3wO|_uHTN_1@-?k>g4y;&5_@){=b*~
zSI2Ku*Kdk-efhA3%s7r?bXuLaZJ{=R*?wEWmeXUu!#(jnK$c$c|NrB0+d}R=aq|aa
zy}#fGIF3I1Bj@(pD_>v$;@sUz45HV#w-mo*2z?mp9oIQdleO-*_utsqTK@ZoY%E4N
zr_9;_#pGy=#W+mBB=o(apOu^zb@n-qwRp0gV_birM0l^hnHgkUe`8KO?>pM4|5-c0
z&jyBhju&7N79*$b)q%Fk{-NF6hrGI1H#+HE>Sj&Co73ho43>|OrMDh5k0E0o13m5y
z71xNUIw&9S_1)47?OSWTUo7<>%drxxum<apZ+JdzAgf04f007X^v6DBdORPKZyD#>
z&vMQtY(Xi?u>-rX2W|TK+xgl#K6l4z{`GqPwZ8Q{+0EB(;NLbL=KrD@Eyz^!e|PhL
z(M;bbp7Y+}0rC*CY3JYwy<-L6+I=PIDRi&qTMO%2&(}uG_Z2t!MX~-zPhNNL&TB#?
zJ*B-M_MvAT=TJwF`xrc#=U=r<`o)vl>^!no{OP;mc<4N`+_<xMy~n(3kKqJP;xx|U
zkJf)=S7V=3*Ky~gkpFi||6b`oD6P`_pZ4GNALqq$5tnfV+4a(Y)tCo*$9(CZJt*9u
z-$M6(aqspYGo@en9bpxX)?pwYAUSbRctrNwC;gsvYM^utwLhZc0jN8tU(r3*3Xg3l
zRq}rIczI-u_CNW(mP|XYBkSd}L9Q8sVHkm8j7I*u+Stjqo$^0lJcp<2Yp-WtHnT5e
z!*%%|P4as4ZTbI#JdSL;{GXBc)81Q#{qNEjdDvRw;>?h7Ex$?hxPQPDGB-;6=tS3R
z%9p{)094#G_y6ZbVSCH}5vDn>1W93)m_e^uEo~@gUkmL=%Ah)=S4gYyS<dMV=F@-U
z+t#vqA<S_+4-2pehpRsoe)`Znk)y^m4pa}(2HnTE?^Px(_WM#SM_Ex($YUiv?)kNf
zT!VG!RbSaa_IzC(hunfvl;e~A3p*U|!XA8o_g{v-56%Dhm9pty%`Xww``ba`5P1aU
zFKgHNK~dO$^)JGY@ssP;8l3uP;ow8-Fy8*N@Q?E2A^CFmYp;eMochb~!!fUhJ?h$f
ztqu6WYkwYoH2g2@#rEgn{hk-Y&M_~BUE^O2-@k9HmALm|$JCd?KkR)elwE&0Z14Ya
zD8Kr0c=z^8A?ddiN+-S&{%+_i;lt{pQ0KS_S>%w%fip$nC+1OoV119hS3e#0;YXQI
zhhu&_fjw_NAAX3#yF)nXxFYd(II>^=UHjYNXS?4H|Fr*mA=$GzRBziDst$iI)V%lI
zkV<^lH~XDX`{s8;{r2yKx`W>djXmEE4T*2-Km1l`8u@L$!MF4=d@Gz5=UJS`MO?-e
zT*D3A!W~piH~v67KngY51!^y;R~$F~0QDoayF2HT{{I8VkI?T&(y2ebjSX!_?ltWn
z+6+2J@&A!WHyV_|jc7tM3hf`0mE+U-|JnwcM?M<{h^w&vuem;h=yC7FjJnYfdPhdw
z7)Bp~?%U#4k0`9aRQXIO7FO}*vtcwj7D;ukapVM44wf$G*I<(4DX2TeK8UZdjzbT-
zQO*BnOHy6@e|}=^80~-awaK9#(_B-68JLB#FI&TzoQDNytK|Qu`3Pi37oULaa-1g%
z{{KY&KbrWX&1m66KiQwA^&r1O9E-6O%~Sh_<zz;kJgZ)^lD-OSkh?E_bn?Ty5a-Z#
zui#hsExC=oe`w4yy<!C$`i-|k`M<p#-u>`xZCKK>Oq%>x9NQb$@Q8bTZ*guZ{;TzW
zw{6!Zi1K*;O+E;@3wy8+e^mZ!XZtVL|5fMvWc}}eI1b?mlIZz@`pDm!6Yx3pkzc5{
zeP(k=3GdT))%P;{PH#dMIW#ZvEWY}7$kUJE1md`+xEA<HdWZDKwOLNn&m!(go=2B+
zI#F?ObJ!sKJTBrglIJ#u%FO0)g<f;t|KHvmuF-Fxe(07^hqQBQv30KUd!h0h&5c{a
zE$3ubyrcj5op6U<{uSeXir9zi?}QF?-hW5^@Ev`??}YAC?}U5Kd4Ol|2>pIc#{djM
z)#@#wddZfMTDT?DY=0-zZhI%BQHOdoyzx$GL=&3Pvi==)$jATRO13R|C$x`$C*;QX
z|LOjp-Zj$y5B2|d)g2zb6NdPoVHkm8Bu8zre*cy*nqE`szcH3R4)xolf4B5!w&)Am
z5+(?ngejPY63oCX%t2wjCGAqNo-)tz0+i`1$~TzHNv}#9|Kok6@O1pyaP{=(*l85n
z4T_AdbM8_s$4aci8np4@+P516v(xw=vhyJSjx?SibdwDmjsHQD^O~<3^TPMbkXf{z
z6XyF_C!P)1ge}PK_x;E#rSzDmb10|pKsWoFpUMVLWQ)n@n^z1shRAPwki66{>?04L
z@`Ai}U!KDu$45}NS-m3KUNXJj_5WNHlEPES|3bSX*@P@|_~i4;J3fXo<B?C2rxEAy
zoF&V@s$Jmg*7c!Z#ARH;HQc~0+(EDT1^39FuUI#ed=$ez`%dV$UzqaZ8FByyVF=2;
z-9HQ?N1zy^(f9H@VJtZg&A0o73FIV9LkY6iJ-;4nhtNCL@(tfohqRAi`7etekH>sd
z{!<RbzT%#*yc1?QulGw^v_BSw(hKsrJpr~>%2)I`@wcD*cF3N8HI9c|gub#ZVKJFf
z=G6FhmeQA_u)fPd^*i;?xW3CuVHFFl=}fLca<2DHZb0QsbBCmV3Vjo{pcLiUfnBKD
z&#yr0COh(w-MGrHpcmHvxx~J)L5*qljf`X2Tj<et754X;Yh9W3#^-I;_D*)p);_>i
zbs<lW^YfZ#Y7_N8`)~k<kew?I9HAH1zj=cn9{c|^K6f|9YuP`)$MtVYm&n5je%MUU
zi=KSWy}GUrl~M2Sz6ZL$7;g-x*WYF5hf0Uv3-jX^IuAAPNnfMzF6paw9+IxfA&+A?
zfs;6mvuK;m|3@5C+kvO^Kh&{fS=-<lJ$*hqJs;09gDkyuyJzP2^WwRP%eaDAH^@xY
zj!*A+i~S#NogexwbT485(e<YEuQf(MScUlRkq?mEXiNav?|}4gmM-Vkj5m+N@c`6`
zs~)wJrE{e8pZmD~-}hRB+R5_ebaiC1e!TkmDL(qg<<}tB4?&zCl>e@=0raQi?cY;B
z->ytJ_}E9!Z&N>4zi(Km{=Y>1pKRs_wA@$!N0#2&qh9_Qe({UOg^Fu5#$p^MpwE21
zNo0@wFoo>3mtX$>F-Cx1f*FWod2;-Uyo&TJ$KC!bk1lkIzxg?1Y=qCl0xUxIl6$~n
zdX;xvjl%j??Vbz0(EejyZ2eGuYR*}Xl~{!}SceVRgtp=Q_VLyq81vYV@4WEXZ_l6N
z#~<gnukIfj*Yo4iJd@wPgx@~h`U8{s@vrp{J?2FAe4V|2iM{VMf5WrhS{=d`amO_b
z<C+1b^mqT>`U=(w*g@ZgzF$5Y`hH>ktk*sj_Bh^$12}}_!^icZBlNhYVd{+W4D=N0
zH>(dJjoPEy|M}L{eALu-ej*<=t|?dO|8M92X86CZsY4U8$RUqoIDxh<{olsd=lK3{
zoP8(Xzl+Q}?j{?~@qaJzf60RXzn5?8f3jq&|83@no)ph%oW*&>xd9nmq<82C&kY_F
zF4Kph`%V5opSa-v^IxwBtKbJ;BX1x{zeV0bB|WuJx|Z;xe)g$wPgvbh=^81W{O7pN
zUsa>@BX!Do$A!DsG+Bq}{}<+e-PRwB2d<A}Zr^RO-X^`@Po!TtTUh^*?2zX>mB(EV
z+4ueO|6cijr}V$U{-PNz$RN%WZbh^Ect$(}FbG4C6_#<|!{~8OndtwIpckXtaUNab
z?!;(e6`sRbavYNM3FIVH4*uBxKdQ~j@if%Y<J`a6GwcHj<KL$9k>&RqdAN4J{Es?%
zy*ymvni-gdIhcn9ScJuBOY{F%sDG?h{~$XztACJr$K7PZNAf?Ks^!H>`M*ovJN($c
zZ{4eW=~8ZgT)r$7-*V*tjsHxpL0K((MZWuMWySMFVFP^=wxASo-L7(S2X<i(;=JKF
zk9Z%wW3KW62k3{;Jx2L3_2c@BbRQ9xL<)82`^F2QiOeF0-XHkh$R6u%9wSfSBu?Wj
z;$9rvq&e=zQTB3wdu(Vk6`p^|y7px6zkDiOA$zn#Un9%^MLp#k*0!hL!X4bh13W^%
z57mD$0DZN_|No}QejbCuAjfh3-w<*bMxYpR&D7E4Sd7C2B;S<(=PNtuaZQ61rqHLM
zevJG-RsQD_)J~WG_kV2P)9gf-@{b)))9c9C{(as(#lD%?wzl=?v!O&BGZ5!~&LZ2`
z@pcr}*=6@T+5E1xY&3h{t<AqddD@63Wot9r`qchy)c%jyM?FV8^AN|qm4AVq{~O=R
z^k>2XVTEyS>-o|%**&auKZ}HS2#f9F6)T<ziybe;a;!vb1Fys?dbE8htf8+%^yxpj
zesI-7>3`EX&X4V=wQos3>O7ZvY;escY(Xi?u>-rX2W|KH|A>41b!=yo53<c<9^D!B
zqwCU-Cg(Jxg`T;@{(r>&BhHE3C!Pa1gd@ntbNJ>-`u|JWy~kKp?2Epy1{%Rbv&1cK
zanmRUjCjN?M!e#sQH&T_5hF&77%^hRfJeMy#MoXNf!#FX7MHkem$<+Mw%xm5bM|x2
ze$F|?BVJh%BSwrEF}5v6jM(B8uXurdKQ$4Ky4F7X+&{kg{i?<oHJ<hORsE_)k8AHn
zJw1us7-<V<(2Lch<M^ZY&qD7A(RZNLe)0dq)zyw$;=WL>53@U_L636*PuyqT{$1}I
zsZ{@W`6jdn^8P<$(@yRFmByLxk!NC>cMT4&lW*d$cn6)!<^M8yjO>xmd-up&$kKD$
z)c+awzw}$zu>a-5_IdK=0{Nfp5RU#o?@6b8n6^9l5l;C3d_q65e%5F7FOb_P?G4UB
z#`UwBgZuZre`VV@i2gs{kv|~s|Kl5PC>j)g5<Z6HZspou<=p6apXMuWmfvtH1{#Nk
zk>u$(6K5m)<=8NeJP#M*V)U&%C0t7O-{rY$bH?Ys8f|6DS@Ef*%J)Tl^XV1m7R&GE
z?jKe^s6+DOLwl6rE$RmhYzbk8{jR}v_&sjK&4~XyDZhVg=zH7vmvEf_*MGf#dD8nX
z(dH+sq<J^FhF{jb<b9~w@7?$DO`tcpruyXt;UW4XXewb_A~8=LfZ7Ab|BaW|Pc#0E
z#!1Gdk=$(jd4qNT9rGAgU^ObDJnWv0pT>WccGemHNB3;w_voE({C<}4`#Hw%$>xLR
z{~tF0f1A1fXzw!qk95NLe>MMrQO5VtiZ*?e?MUgfOfN9LKlhx_xr~26DgS_((noKl
z`en2Fh0L*WpK?8E@AVn-IlO?EQ1PRFhj)!{>hrAQS5Uvp|LHUSPoMDb0%<1Egf{p5
zvi)B{4eF70->F&g8TkHE?hUOm9=Y|#|5q9Rw@<I@?-o~8CXRd^HM{*!f&43~3p`i%
z-+*_7-$Qb{=eoml?(>}OTk9Rvt>X8L1~ekEj-PUif1Th1$9#lO@EN|qSNI0sp;H~(
zrLOHJ<6M|tG9#QNbH4B9t>(TY%r&U?eRi?`wJlSgpYf3VT$mrEvAWp%MD)M@N%$BB
zz3kgR!++$+@>*kQ<mosQXJZ`xAn&!!9vsf2=l?&eu0Efh7MH?>^qxt|26gVm^h=Q&
zBmYj8f6=QC;c{_R>-p=DGZ6osaSeGL;y*O%wc#5M=yMdl5y|~O=ifuxDfDdl;Bd3}
zTX8$?#ND_T_u-fQ2M-88gvU^R89#ZlZL+o<xt8BS#SLt9dRklxPtkj#ZC6?lo}oX7
zTt?nSrb}L)%YJ|KobZD9{&&v_FOe^!kKg4hF>Wy*1hO7U3_PhFM5d$O=$j_9cpY!z
zuXqRV;R6g@J1%@g_I*4qd_wk*9Tz?$zrr{84y%@UZ{Bs)KKD_k|D60meEAteLJd9o
zF8(Ckuu3_gyh$vR?>A}#E>{kqcD(veJyJhWJyD`wAoKoj>WJw7|A=G1K!x&o?bH6x
zNgwnVWhPEVr@AUX|9|~&>ObM!8ugzvf93zL4oo3^K>fGR_MPVE?KVGem%e}l`~%qL
zsiXQ2_G=%YQ-6HdMD2^o`VYwby8UEUI7g0j4X4NJab0JUXJZ`B!}+Kk?f%r6_0n#T
zZX=nHX3{yEx}39+kAwX##-+F%dHY-ZmGt;eoo;pJ4Ei<5%~$`<{n`FrWPXIWs+rm_
z<c)~-_swKE8zJw1KiPAg>KNmQw~J4@p4bM-`x%la*I%$t^wV!#?)g`HHwo{ET)AJ~
z#OkB+^922Ym;BEEt@t`f>uz*zm;ZOl|76cz`JbM#Z{ELSv-bC1{eKhLAHL_d@$3(z
zkVXgcer?)=_e$$N<hHZ1kx~CgKex8ch2a7558)9+|G#vh{;j=*;W6R-y1nZ574+3e
zuF`HmejSYs@|3tLHt|#BGpL!#r<i;J)f=Pl=Y8Q{@RIP$sOUN~tlj^Q{=ao*ShMlW
z@YL4R!@uGc+iLKPa`zAO$Azai{zF(lYFv13+1cTb`_2yawheqaA-r&KX!w(N_q@1D
z@BO7ogTkNPPtx{Qq|tZ5KZQ+u`4*k~yYQFs)^OCW-8@y?<bMi(_Ft@*4xbY?PCF;O
z*m6$zQ^D!xu>C`*G-mev+%v-;jhj6;<sZX_mVXS-PWo@*4|D%p=;uFBK7^nCSpFzQ
z-q%#`tHe8Tt}I@U@?8JLBj3Tm4g6ki)9-PY|KbUMfRFGA%J2AH_>3IT-!)MFv;SYS
zdjC`YfA{DUvh6E;gYWPIenK{|6|Oe+OCPjFe!!_1iPLc=&PJ8?&nwuf?A;;{-D>|g
zG0wL0a6a?_hE*Gaxnb&0<9BtNtpQhQj`l?MaS8jFU717^n#<U?Z1vVcw)|w{+tb*t
z$gksO|C8$jY~vq~-Y&NLW;RtN|9G}>TZ#4mo%3Q`ip!C1k%sM8(tGx^|2OJ~c#>V<
zyU(%Dv&ig`)<ORTuuoN~{}7gIE14%z!)Cum+;ynl#s0N#L&7@#!Z#u*ttK?4+=uIE
zEu3mx>ax(h^b+?mHQa2UThU%{fqt4v;dc6+xEuH4K0JVj@CaU+71z-6p9uQaIp$OA
z_xZPiE5fVhBUYCgS1Xtv8fKM*+S$`W-M(qc+$o`{Y)WWcU@lw1CE8#5A6#>s<IsS{
zF|G^ABG)<DJbCG^Mg^Y2Gk6X!;3aex>woc_yY1UU_Lg`y@_ZcQ$?eqtFwgs0;6BOx
z`<d$5+V8~sX<JnkUY5=)s7Dg%jsE}O-;CK?hJ+q^H@)IXc5AOTeV=&i0cH;S|D$79
z&sO(rFS2f|enNVUwMScH8mQjwyFr7vZs9DFsUqcCQD}5bBIEy$+W&R4*dGhD)8{ES
zO0~<8?|)dw-f+yD_$%H){CCZJ<Olc&o%ZQkq>ZtZ{XzcC|A*NiEoem>+L1yU9r#2V
zpWzF9g>Ml3cD^I~t!t27JS6-;{|V9e=NLI?tNevi(N@Vvf;=5(V;rJyL#jo)h~DGg
zx{+Uh?^S(*^ej&Nzc~I6vhO16e-5(-pmF}ZKg2HO2hMl=g}4}(;&NPx85nqy&%#CK
zkBrt{5?=Ghe+}1>zemk9{txo^jr4}?o-J;s--;%B5((vdwDoF>)Q{?<d_VC#^^f{1
zfn=9@3THazcHD`(aWC$}19%9XbJWlC)X#I(&r8(LZ2HV1_488oGugaD{g2kYY~9`L
z-<|6J<?8n`^*b5k9+A#tSb^0@AGVKjyn^22`nr+t|G!B7kNy9{)xQPoUo!r4vuXo7
zY^E{`H5oqFj(Z0A^?zooSMi+i3rNm!Ux%~<$iyt|j~3U5)}5|zo9jbrhxUo{$N34J
zh5Gl3v;oPUVr@P$Bb+63+JG-f^JTn(8q^~hb>B%VSv%SDo$C1_j>9+Za$Y3SgxLR|
zwqG~0cpd2}u3;LVK6=k+_m98Q-$8DvwuE*^X1V(>H7`J1?En9O{0KGTKOsLuzW;x&
z^WqEPuaF$?ypvo<iR*Blzv=&<t6hyGn(&QdzC-qFYyFVNFz6NEKTgGe+W$XA{zL0j
z`A>WOxBdSs<=a(kleqpx^Z^_x&D3uBlkIXkJ^KHgNuHRu+shV+V~<(nu-0`|+%rA2
zYL}dCzj5dvHYA)!_I*7xoKFq}Z9MW~T#CzaC1&6nT!-J|M)YrCPmq1J#%IZaZ_SM^
z_H7~BN4JZ+6L;f2WN#=550F(`UDMUmL;PpzL&A?B_C=K6&o6-<eJi3Le&UdPr%lpC
z#(xY|IA(2oNmyZ@)u_Nzcm@O4`fka-_w^Bx{UfY>PQHv+P=h%Cubxbz73s(<ZM=QH
zS$clGmr1@?`s+Bk{=gXD^2FHxYkv9E8Ln-HcIu4KD4bX`BP7w(Wxlav-o#(=4t_cR
z?>*s8`*fju_EqfvtJwcng$%NDt_r!CSB2(;`ROV1(_73>UvGZ;M)T8=UT1zfs%(q%
zUp|o5NB9Iag){VB%?O{-8<x4YB{PoafA-YL^#|s=)<rYI7xrn}a8>w<{03>~{*L?s
zsjjQ^OS!JHtHMvh$B=bR@w)QYInVV~y1vb>ug>*3XZu#yhcr6SxzGI_aDQaaQS;~N
znY}aAujbEthJ&i)1)PeJI2}L8BWIFlqjrJkJN_5XceLj#o#Y|US6Wrl8E3!qa6T?X
zO|kp8&&BkHX^x#TBV0<q98HCeMPiiuw_jXaGqyRev~BemeM#gs$j|>fjo)dU{}=e7
zin|dv<5t{`JMlO3|FroN$gk6(p19jS_u@W0fY|<jh<pT(p|e>1s~-Ii=l`wK|A*E|
z@;}<uO(~@59r^`g|KAGftVRW%LV7;?Lf_sq^qx8DPdrC|0lA~{88Ya_OX6O}E2u%=
z2+xb`ue~ZH$yM6nE2n5n-*i<zAGeCj{x~F5=&N6QgEdD-umj%KR*%P;|Cc3S$D8;o
z-a+|A+8pEu_y}?S-zVf}i0glSL4JjA@EuO9|Mdg?#QI-9(T|}?8IX+S0Am09pjUlA
zI29vtI?lw|7`VW<LH13b5zZs~uP6vBCn<|YD=)qG3&q9xf6<2``VL$yd?`-M|GS)i
zV*cNi^cnd7JO3~3n$LG!zWvRY=NkHT_&sjK-<|)rK^=u0{#WMz-7KA3aXaor&64Bu
z|L&%rnE!V#{XYD5{@(-Q+VmYfL_UJ_!BfIx<O-zNb=|hb|J$q<u0WQ}pF{6<?G&<e
ziT(?8uhV~lUUg{((Z?=7e{q!dL9zCMI=Ib#?dseV()5mn>d)Eg_j$i)KgD*1IykvZ
zJ&WtAeA~aMKb~?8&)~UuEf2CCD%q&?y#L?z)|;Z&q>Nvz;r~Eya4q%rsiD`S>8P}k
zK;tU9xH$i>l}txU+zXk-FX#WgF8pSUFVk*Wr0gg2^Zzpc^Z9>&b<8_>4<8`r|BuK|
z@EK0d|6}w2YW`mf|2+J!%>VmBI$z-%e21FS<MaQ1pr4q3^Ar6Temnnf)l&HzgQ`8}
zLj4|hj0mUFM`9qZt*?F1H`W|O;r=71gfq#rQSl-B+uC~nng8E7`<{pMaUm|orMMhd
z;#dBEJ3QAto-4BS+&1=CO8J*j{&_xa`0f0U8PbS-2iK6-Av<<N_&qsbeZ7G#KimJ;
z@(*}WAHa*|#@Kc<ZpH10{#$pFcjI2%hZFvP571Be|2;&141eqYR}_Yde*`PA8u|6#
z*=ZH@6YIa{`~QspG>vAPBEha}#7b-Iud>eG>aEuQuO1nma!hI~`_f$dXXwx21$58i
zAHW~Em&{lvAWP=>RoAjh>)5CD3)#PA?B7MwPW>(<GrtQ><V%iy8LyxQ^+=)>X>{^)
z?pn_OfZst6*~{-BGfMi%p?T-;^e-CI-Rt_ucHxx$(#x#>kE;ESl};9~<4x4Ck*g0_
zYmI(l{egGr@1aTGM3S9;V*P;+#QidVe<b_~K0};;ALreCL0|i+zB~2ISM+c29bWJ~
z*DiJc>$KrE^5fc|Z`L=H@a-fI>;FUfmJoih?@u^}K{e_NoQje7Py7F7>i@%k=zlP3
zP&i#WXX0#(LwdIQ7w6G?)~J7RKK(-E=vibc^$%jr^s__sFT2>bOHpI2?Q-%;RMYFX
zvFkBI_!=bXO*pa6c5Jg;C;mjc^7r%`aWig5T>tM*@^0LV`|toB!Xp^?(ti@k{P@F7
z`t<3mWBf{OYqHNfc#7=5O&@-geEpw`@r#SX3uNsw{ST-|L!ImDQa2+>Z$f!vVR*@Y
zFXI)|pdLxIqI2ml?87ng|8V)gME;*D|L>9iwWV6nYQHw+?#chZ(L3fDKe)?W0_!ux
zx&5*Ik(Ork`&p&U62~sOg<~InmV7<x{uLJ69|Q05n-R{hKcN19hu)^Fd5=t$@t<C<
zO-=s@pP+lL@)^DCg}i?XnLEJ7vF%ONxA6Z)-Y>3Ly`oNODzY|>ZC~IkRL!#<0$Ht3
zxTdl&d`EBF&-aY{3CA$#-+VuP>PYcJ={+s#qpjAYp^rrV|G^v7vDNC@I`yY>)s0uj
z;pF^#dfrYYPj}3jI2+?|9?r*wxEP)L^grm+?<RZn@Ar}!;VhZcH<4fef2#UXy%^`(
zwY#tU{CoG-Q4r_fYa3`!tx9QI;ZkYVY_<Ns_{-@H+7og8|MD$-;f0&x*nu`hLi&xy
zCuWF?{}sxw|9_VEm-0U(an~W0aqqMB8_d)G7QPYLX`VZJXL%1~t?R1OX2`G8GgqCy
zOx=!Tm;8@de%$Q1TX8$?#ND_T_o1`S_lM|X+JoLw=^{(dAwU0jxBQQ`I^Ul(<G<n4
z^p36CAG`GLOXndxg2zyEQ2l$jFsz`*HE-(W<JI&E{K|i<!20;&p22f?0r~HiJ@OJg
ze#bA9uOK_YSRR=~D=I$ZSD^i!rgtNYIRC!?T>smm_q|mR-XsSu;m1zCgZJ<OKEfyX
z46*<13-T*`gYR&n|Lq5Q&s?_2Y<(B>V~9TA@xM2jnd}F$cJhDF|ESJxlAp4Ri^HH=
zWiC!d>;uUAE7h5+A>3&Uuxq~OztHnX?<mh7S;RF0^6P)^mjCyv|H<~9%GPC`KRVEN
z+t2=g&B_4&f2T|HY*d`(`P@<*#?jBi`M40-AIFA^$$@W&hJjCgzm@!agfGXHn1K`X
zm#?8W$n$Xy_I32%qv?>kML7QZJkG(cU26XM@^PWw7<PlP?Z!&u(#HOi^d{rnXFBFa
z+>BdsJMP5YxEG!4jQ^v1qxt{FW_pe7XOJy3{=eM#zwz#toyO0}w!Oyx$&_%K?66<k
zp>g3p={$gk@Ceey_Q7NHp5f9qj=qAv8ad&taE9zP{uBTITh%fyteoe)EYklse_SZH
z{zL9v<8So(S?+7j&-3qJbnn7V!j1FXC;5~#pF#Nq>ot)t;3d@fRw~Hal;^WXI_srF
zHhMM*&n0<OI<CE0d`qSKLtC}`%eeMmyx;uyG1vEjxE9lI%ip$%?q`@WDCc@5o)_uy
z(wOR;^qytze~I(aTM>N<PyYX4q30;BitHw{s97{FyiUG}YRBf+MUd8Ch2KH`{|CFY
zGn_a6|Df-dv%`Dh`|mh4d_aDLPw*MOz*orn2hKO-clg7U3&T(3z#IQ>I7a?);l<&(
z;(rbs_FNdAoqb{W(;nkx%TEuLMaIjvT^#;gY&>n|nPKDXGsBBT{}}$_nqD6DPhqoh
zyG^Cq4qN@3a<lQM%5%b&b>_G~X-&*wu4}P>V;(UF{$oBekMjvxcPz9Q90_d&KY7+a
zg_eRJ`HuV$n)-eS$%5}gV&Zq9@xXVXq2Sw4uT59Cn{Vgdk;Y>D-)hIH+H<EGlNlDO
z`q)(m{X1<R-=8|)RKb4@|AL|PQ&BPHpTpX9{~XHK{nxN&*2Uo|_wuiW|DgXV&L8x@
zn^FH5{(z@jj6W_oGdx#f{Bh<#h4QxwLwPtQyfD|8A)cT6PoZ+!+2N(KapuA4*Od0@
zSf#93Ij<mWny+8c_mF*yO(OnmjKg_29~WZa{c+)9vTuTaM3DW9`4^E_Vg{~3&l2sk
z`SLgY_sA71vyd61u3x6iIbcm^@i*gE+>W&CNr}IcURC7(Ki=cr^!soxYSi!5;vb+l
ztTTqy#V;a`|39w%f7Bce#5H0XHx!1IE&dnCj$W<rJ^J@O<e1ca{rgj_y+eNtE6}YD
z?LjXx$RejMtyN#vb*cZBtN+)iQ#Y$y)#FKh^i5-=>(~lBg=g>_UcgIu8NZreP$>Ok
z={t88IW!xSXhCbKu?Do$e>p#)%DG;VP7UgjjCvXSd$cwKy&>V+kfwJd{u3Z69DO7k
zkrnqk-o#&VvVKtay(9b{K0^Ng!^V|9p?`)i@D*y;^Z(hX{l7>1KjzDW+W&}aOvLr4
zWBdOb`+bKW@Dq+<P`x?{ojZ;H@77L4&pzY-M~(kis^c~rD<PX_YyYElzWN{S^td)$
z+I}5}99K9b^b8*oI=zps_+IEa^df`S^`4uyPIJn0^nT)+N*&r)?FT(a)N5lk%<~>+
zn@5S{Uhi#t9AEJscbnVj8b{)EoQbnB4(H)~wCysFn7kPA|96*?>4Uz{!|Vw9iS<XW
zq|d-_*B{CIKdJ*)E>!-bCS~4@{eF+?%<;M}u0N8Wf8Xfeb@X4ZKk|3~Kis1Ik0!+R
z_imKd&A1h}<4)X-d(k;X{f}<MzV^SF|F=u~AFYV@+CIwjM;aaIGq!)9boygo#nrzH
z576WI<3lm-HDhDsV_1O!;{adle_0*%d-xELPvIFnhqf+rugFxL_P;#-68&Yof>=iM
zpx6CpkoQmDsQtIxyb0TDP>*EPTZ%&UBICI9hRWk(|Jhfy)r8}}(~?N6VgKV-^ZypH
zrR|d<)9mmpJwN~N3~ey>bPs#KSJ|9lhiBR2IkI*>`&ZvVJsQx61d?dN>(Y1=f5kg^
z4<Fzoe1gsb_CLDOQ!M===}(n@Nlf4O?^;^e`>kyKwmIzodF=lMu8-a^ll?!*90KWl
zfv@llYR0qw#eYX{VDH!K&-j7<6Qb`xTpKV!#(&5i6E~<qUc{*wm{1f(lBeTL^t02>
zCi~9vZ+CL#Xx}NDH}+MXCtSNr{@)}2Pm=!;*OW;hNpC_{nSY`EDzt~!Hd^b7ekm@;
zmH4~!|5nNW$f0?@{EyZH%4g-n-^~A0|IU!kHMkDHN7{FkYEf^{dp4_Ia5Mc@{C57|
znja7GJ+lU#ZFeHhJ-M5_7uAR4zdh_~+$a11emnoKbHDNn-Aj~T=q*)#X~$)?>vCjs
z3;)F~{)_IV4ef{dFCy)JI^0jKG5@;dY>P5}f@GZ2pV-BJ5>0r>IUm7e_@GWcB3GjV
zPhsUz>-`P)KE{ObjPP^FUMugAFC)%Bc!ga1qcTKaKn=YfNwlKPJ*LTSWbr!U+AeX8
zvp4BI`dPb|Ti1>L4sx+?02yt7Ui3ZqyYQa){!VjLUNe8-Ek2*ZAK{Z2Ki&69_G!<4
zL6$#!O8AN#@LhaEeuwH(^Luvk`=hTM@B6}P<#1dZtZ~QB{Yh-SCj4ZdW2oQnd+YNr
z82X?_*M7A%49SX1)blu%J`%Oll<TN3QqHSe8w-`|^d#A&K5ZVZUO?*@^~5Okf_+m+
zJAcOx{p*L-E9l;-{@?a<zq|N+KYWX0s?BqEo-=Va#vz@M2F|1R9B?eor(cNJX2>G5
zPg>$H7FVU-x|F;eHTzEqSCTVOO|P#zKCk~8;p>px=(>^r&%oAG!tcf1h?{XMZpWRt
z8~5VG_}YE+_>DY3K7{P&1>q60;!}P}?Ec5-E3i7o;VBF>4++nZas2i<vj2`D;RW&~
zyo^^+yYc7xzsUP2l>dvpFEaoCiK*)UC;1fFuO3OXB8_fj(Ya0jK{tBPiwv^JA;13r
zMEQUG&;Eb(=-Yat{&`&*Z=!8IJDZI2e%~SALt5Mi<VWb)uN+vV?vTHGk?T{3i2KC0
z_@C+O?ZVoKRrLIN&Yo3V8>dNG66?}LwQms(^jIc*=D4pgaGC!Ok!#N~j(^dS@E!dJ
z{Dfl|lu)+fRE$J_Fb9V0`?MgONp`3B{bbBVq>n>f+c$^aYBoN3zVExuKIhqXKGGZb
zC6gB;WuIDgVIAt{yEf$4C0p*Am#P1kx~IiQL-y5w50~2ia$JcSxCYmu^RV>~kniW2
zEl)3yw`a=Z$d$4GN2~vf)&FGM4)y;YWh>J34)mYP{?`VG|2OMDZRoH32Y)a98*wvk
z$DN343EfTJ8}%D}A5R|*_tC314hj#D)$8~lkdL4yqs)5k2*0@_`j*|JeG>EBqkZDP
zmXfF~I~M9z9kcCNXh0)7D}iL!G4JkJxK%ojVFgyB0#D%?JcrKIvCx${rvKyL^?&?(
z=tTxu<j}n1SZG1(-ecN7Y&dDAklt`Cbf9h2vG9U)Uc$?G1?l1TDL7^>i|?*T+Nh@|
zk#oISWD1XIqe@p?)ja8u-Kd#kJNY`QXE|=R`$G8zgTtHRlGEH58ts>udCdG)<-a<3
zHQHR)U+t4}o$1}j!aMYeyVSermOp#s%U=02LuQ5J{~h16&j<JjpWrj}y>>KwLH0j=
zBz#4FgYPiVekA-r{)A%~l$8G`dH$#`ah+3LmwQRf_xxR7lj|R9pHndsr{heVjd3^+
zznXun-Z|O-H^%ji_Wf;F|GURlvTd*SKbil(DA`e35YCszg}4}1T}Q*EWc9A2p=S5d
zQ2y@VFjF|o2PgIcHWd1He9H+kDcnS^nk}Et)CV)1J-dMa`Iy0BmSbk4W|Tbr8o$BM
z)y+lfXe0{M(Z%dYGU=G+KK?;yMV!mm&Mr^!@yV|bUgF<Ah5QQ~*G={~u9wURXUSY9
zggMSL7xOS5UnnyckPER0wFk8i4m)?9Yd~X{Ye16TRPCHI2Zbf}TZ&~^juZX|rSud2
zdu8-h$jw$JI3~8Cd*z=s;?`k3HelfC!J(4eh|TD~U*87VcavvB)*<_7VQ3-CA7r;^
z!{q(D4{O(mt59dGy`?a8(Fd;fos)fc91GjXIR9@uxdS_~3vKF;-Q*tZMIX|>(YOZS
zK6+1yZx{RN2axMiH#pac^$+v@|8v>6XxPc7-OaY$%f=;}cF-Mj5QlIWM^VreRu*D7
zezpFeHbLJ0E>K4!zy2TFz6Jl$`hQh>hK3^PjKXM)L0ki{8s(dOb8PW?#Qx)A+nUtb
z$u4aWZHGp3rSC6(gRAFjAF@No+b30~{jubfFp*wywffn*5Z$)*OmcnGv>lM8=f-Oz
zZ5$dV*>5tYU@B^iO;qd4m_~2d=lu35q0dCq0nbKU;*e*BzAFmDEb;vlhJ@MV9L&W$
z%*O&O#3C%gQY^#3_d~*Rvi7k4kKxWS%JZKr&1uq}=p6JWveZ6hScNq>F-{ikhjqf8
z``DicykoNGsQgc6gtKIBzWnbwwirigMH|`?|H~ZzG1!6HUFtveQOuhS@>I;52{MVM
zsq)!9gTe;qsl-NX##U6L4xJO_&&i%I*(1O9lKJ(y<hdN0=g8mV<!|wAwzrch`=`kc
z=W1KS2j?wiinQV!z!owsE>#J7{=Z)9lq188*Kd;Me%GX|$dJAA_L{Fx4VU{St9H5$
z`&aL^jogMBaofoqXh<jnwz;>h$IE~wGL`{5ZQF(2*n@$$28X?5ANHZYeQ?+x!`j~m
z$mnaeYPs*NEQEu?hj18iY%e=R-A)!X%O8ky04uI83Tv$=*mKA`@_sVa-cgr#Bj0A}
z!|hjuQ7G5{F`68MVhp@p5XO^z7Yz*)$v8F=?f*&i$(VwvsM407MwTGX{hLY7LN&d9
znRaDqL6|K(2YI_vz0jy_nV4k!i7$8^-|>3#Z`S{-92t(fx4Dj+hxu3l^8mvttVE}A
z{4Tb1etdsE`+GL~8~Ok2vALTM@;^V!|D3Jd)@6P`-ErUk^k(z_%h><=0P6O#|A({x
z(Kv?vU&#Jv$2XA+oo^AAAj>y%DY*=5AN5Tb2kji~{Kd{cp8bzreOf2yzpxRT+4wE4
zua(W;#`bTgr>3(1r?CIATv`LxLM$cAunKFi4(qW2mDq^Q*oyvI{Uc=G(?y|eo99PQ
zpam(Ux4Xw({PO8NJNV`cXXx9I)1QzPmvLM#8RzbA7rz5Lu?sbY`X5G(47=%ZPC&iz
z9(uGzo0k3B|6t`@??TzPdZF@fyRzLklOoeQ^`AQ?@Bep(Z*K?t0KI!WA7tq{`<<wl
z>xOF!cS*lUzCZ$vh;x4TNoPL};2;j+FtV3e^J0>`KStg|XMy}F|7BwyM4aCj=lsUC
z|Kl2<u`KAAChv<+%~Tc$AC*qc2L9?B2Zw?d-!fT94o4B<8iYw@K|-0(hz5H6SI>Ii
z<tW=nV+@Kh9uqMMlQ9KTF%2b{iIe@a^5QJvoUxbL<Xj9G1DZ#cKR+nUCl_F?wok<n
zYl?-l!b0Ii=zF#xEFt?>hOm@ehUJK5Kq(o^fb`OWP)1*cHOQCGhvj{FGK2j3kePz8
z&bIZ~fb1J%LM6Eon=zozXDb=k|F0(dFY&FB3ACX6k|B8;EAJ1eUn$N3Oo@+k0Bh6%
zUG$2phgv6)9qgK7IncO7zDIJF{(utse=q+pb;92rf8RVb%#lV0+faUq_6NBGJFyF$
z>-o=B8t1le51Ie}ZAyKe`FZ}k`m*Jq`Uh>u`~TaXw!MR_UH9|)dmGs4$gk_GPavtD
zk7Mn-onsI7q7VD99|v#{`Sl0X?I+hCSf+kg|7X?zIW$YFMPEXzwnCdWL%VaNkS^6f
zfkV<bjH4)MjnBL=^u0Db6w>=&8xvx^KAhgCe|F&G3-V)CRm#sTV+zC8_11Kmul-{C
zD2&D!q!+1Umk$la^d5C>x92;aJ`uUmo((eU_Bek(wm<%`@kp3t+Z0SizWw3ZOrtmK
z_ip<57sUSqY+_IEKHfG<?9*28Y+qhFJp9G|ZYr|&<ALGfC3AEBJgvx@kl%)l%Fq}3
z4u(JNKNz0he#n^I|81=9{|SFQcqlx3^l;d4;BZ)7{O`uUehMYdH4_!1e+X-*{SeBB
z{}k37{60Lj?c4CL#@1%pHXG0MjR=1jJu*B!^}DdXKzfOh;g73^g*mp(#S8m~gg-4e
z|E~1ZP<imw@Y1fK;m^vNdA7~Rs^Y;i>tN$w`Uwun&jqK+&u50Z^=F1!G*w<~Jntf7
zK&OYsji-mI>T%%}Y}FUBS6T6IW6tJ#a(1ZpzqnU7pA!~Hb0HRC36^3RmZKDH`z{Qv
zXi5BYXx?{mC=<5|se%dm;x7!f%CEYq%9Ik<LN-oT)})j<8SNi(jeXZ)JvQKE`Oy_Z
zrEupQ_WuI*|5E+y!`TF5j*q=%>A3>_k2{rpyOn`^m4#%xa9o=&_Rn{a8>LbHR&m%$
z#&!Iw$vO;N?|Zw=+Mm%sfZty~Kc@cRJ2;JBz~jXs`@Qj7`?VmwNScR0kN;TeA~V>A
zd_T>6<?&qQG1;Ts?IyR|wgZ!~89T8HyRip*(TCcE^#M}WaYw^M<KHFDTWmbq*m2Wx
z<KIQbzb702wyn)rczdO>>&?cc$qwPp6#syX@oBP0eD4}#_w%d|V19p&Y%4Af`<!P#
z4&Wft_D?zf5WQ!IWA~2>hv`R=D>Ytj3^_yg?lS&ANx3(_*f*sfR0h@@9v2GQw7*bo
zdws&Vz3Uh*9RDAdoa(wu+=uiNWUVphI(ofpYiJSgn8Y6Ux6^*x$Au!tjY8i4`Tl}1
znmz`_7>~{c%Gqt|SM;D4nH|dagW7m#b}uc`&ie-_gHQT@sUJFo>nD4zQ;WkyX-z^>
zxT$ii@6dl3CKl`8F-GGb4U=t~f|^mqVJbNd)h4C1%^Md==rb`3vymR-`C$&dhwPp<
zF3hFRL$1%YsfRMIt+!xYm@jSt7NW|3AQq9;yNlWX#bF73DVCxBkn0rJxYKo&u>YrO
zD@<m;E;|0)qYq?xVAI&Q6l)*UK07ibtfH^MI;_VARAM7GBl@myC96?~1X_^7z{l2p
zAfpdohU_0(5Vn!qu>(8Nv)uQ*M4Ll?*(ID)wq%h(uY3{n+-~uEuorzum&q&J*&PoS
zYX7o-+)JFB(?B-5r?^&VveNyflu^E^eU4eV=lD3m8`|Idg%9EoI_u<rob3Ob?^@NZ
zS>*fwcK%}g&wlONSSh6G9b5f#=mNeDm+^agSpGiCcP`DNsOUW24^V!^DWRa<_l)7l
znrmD{4!q4b;w}CYhX#dF!lN+;#i&`JKX%`sFrFUgR7U@wiS$YM)%=55`u`TPKXGCm
z@kV+AC)N=^;94d-ZVIMi8cHw|v(Twd??N|vkYE3%MIDD6n$=S+>Zw+;?Wp>nObMs=
zsQ=Yv(SLBZbmm|#=HZ0@;C%Xt`f~w&A#&=|xVCPF?49!S`hN-irn7v93*`Ump<$8z
zmY`aE{UUbtQspMwYVFnXzi^Xqex5zK)P4hx7lviza+IPBtFQ*^upS#wiH+!ATo^W!
zefn|QcJU>lSECLIq}^u<E%cr`&t|JIy$d<@eO6osy`E2P*U$bBV_YZl>++R&{!?A2
zd(1fIm;S}ugtub{cA-;SvJ3h3W#sFV?MvlSZl3aIqU)IK`IB)?rFMF%*z+Il`TM3T
zR(dX<4+^`bRmJaP4_VC*v<7?Web|Tk1N=kx@%f0pA7m0uyRBoV4Zq*^@;BtES%bp?
zdK^nQNFGA;waX#;+GP;)Y7Y+Eb`%9E-|IT<|5e)k^eW%#aIy%a(6rt=LjsM++wqCP
zq0DzV+V(Lh#+vq$FrJ)<Ntlc&h(5IW@jd0!RN-v%;4qC`dxi4p;}KydeHLb84(4Lu
zj-g>5IUm_C&CPv&Xjnk+<BPPAT!bZ9inh50VHvp`r6@!6?Msy^FX$)w-`3FAA$L^S
za7Y<Z=v`sGxDBX8`PC!BMshQ@VnAN6Ci^D%M#z41xhkIZzK)CtEy5|RWcx+Gx7F;w
zs=dAian;hVah?o)8@3~^{n3ENb=v<(y1pjYxzjr+v3&=2B7t4ljXkK{Z2o`jADE_p
zV2Uwsc4>b8KVSVOzU$3n*!O6)tqtwF*uH!C|L<hqvwJ&_vV(=Y>&#C_Z-(t#%?8$P
z%#pRL*#D?U0~(Q7&;DO?-2Z>CYv{v1?8gBd#339;=VtbQg8g63{%70&>i>iKA)0rw
z|Ivyzw5Qnr8TLQf;kY*T^-<{*q?NH4j{N$A;*01#+Pd8sMIVh^scT>69@yl)$orqn
z8L4pc9uU_b9Am%iY5Whx*U$CbP(DFlj`-wM&#lCBoas5Db{d-w_4AMK*C_1-k{i4q
zRH$#(mWMFWv6C<vQ!o|N@ay>}zp?MP>HqL8wD=ZU$+kJ#AM<>h3)rV*hj6!N)}vpf
zlkA%HbH9ye(@W1R_uL9RBQ#I(9FeAX6nT!LJV(@Tblvu?-RC~&$(i088s~bCC~=K5
zF$=RX2XiqG^U*f9C@dfsViA@gz0CRprA1*Wy+>WyZOm{PeK~U4*V$3W+t|J8&r)$!
zql-crxe7H?i^3Xm9jZ(CuBaayj`HLAa&XunE{=IMsUI8Fj|sANsro;mj;yv{Mm(7i
zPLfS!eD&DTs}3Gt|1oO~-%4q0#9DPgg?UAtJLEa!|FgDBo>m5DkoT{tRQ}52Ek&L`
z*;efNlPTde*<ruFdwv)CHkku(Mx6iuyRcc>ebYyVtz`e_`e_=6glc*n21ZyLifoDc
zUFQF{dk6F`lwUg}v@P^p=qWOdZS?Kff$sI%r)yl_X4gk%h2vV1Rf)o|-u9i?h25wT
zSB*XNICs8&=FqU0-iIdPq;P_4oI5n^6Sp4+a1bZ$qiG==5<ZNh$os#YtB<EcKErSn
zp;jGSm(o9x`Psi<xW4@<?0+;(=67E^IE=F2XpBKI#$zHT;Xn0%(>KsEk^L{Nwi5O~
znHtUhFJ}MiE1WEiK7Q9z$o}sO!&Gt_N-z_%FdK6)7xOS53o!7dXU^7LJz3t@PK@`y
zP<&iFAhk_9gx;g>@76|GLeH`v;+)8n^}o8mDm64Lv#sI+`TT~0u$*2q#rQ*kzJGed
zLeGEH&`?HSg{HZl{}Rt1ajk|m;@UF$k;wH(E7v!Wm56mvw|w5Spdf4%-i$c*liR19
znB_Y-%712=?;rX3FXV6kZ(IKL^sv=&)u=-PEl8mYoeS8T+S&i%`Ww5ofA<>yCfk;>
ze@nF!?4PE0kj1`@jC8gkwq>`I)l>MhU<Z9CcA>sl{v7T4isa9fHbD#fTst6de+*au
zjR|45{q~>_+0&+nePqRJ)5BWd-G2H39K<0U#!>ud^ZyDueXkgfB8<Xlj6pGeUH-~n
zdpy_e@)u%V|2OmhepCMTy{d0WI{i2KE>=zt<zuIlSA~h<CSfwBU@E4e1TztR|7Ve_
z%6xarLYPgDV?uMud2#y<A<QRZU*!Ta&VO1+F2WL2+;)|A#Pra%xg_lM9Oj5yie*Ue
z_W$-&Nmx$rIqd)K2TMXJy$rcAGnBtGLdHJ5SS7A%Pf1uqu0zd+lCYlKfNICpOQ#`I
z5-No^A~|n{@^?mPv|j?X!gc2I*DstA8qkOYl4wHA|C=4RHKvC;B+!BsI%i#_|HV9i
zbNPFgn%8f>f5v?OESVE--gZ@JK`YwOjug`9z{)P=JL0qJl3w4+8DSgQe?dvuPFCpS
zT|0yi`mmC)LwF~4VK?@mt<L$#z39U}q_;YU`2F;rSu;X64$u!GN6#YTzI#!%<f?E;
z{9znLL6>K^@TyQPzL4Iq#<Rh2dJ&rFaczb~rDugv;@Z5E(c~DU$Il4G<ang48_=!p
z>N#8xCJIkNe%(psMX$OlgW6psp$_%lWyAgw?R)QKUr9(FC<#rY*!PF6&#}|k{&x1V
zG1xdCZ;Eu*Hflfj4hfy=<F3i-=!yEj_Ur#TtPVo-4a%WenbJ}!|Id{F(Jo)7O4LtN
z*}s$6sY^@3ROwAaO<74OA(#3dtE`zclb%>F&!eeNoqT|<Jv2*v{y!cG_VbU%wuJk?
zSI56LB+Q}D#XQW%0xZNLEWuJNL!4K%oUB!*)VYUx*MIW=CzG8^IQqzx+NTVwum<a}
z9_7tL!UnQ)m;2x2{#{=WdN;WL&F;U_{jYQX(rR(9t!OKB|H`q{LHCai^gVi2=)Ww^
zd6?n<;ns6dmc{kIE2SOR5Zg#@MK!YL4hePSz=t!!z}vn7dJEb-=luLf_n-gwfVeKC
z#CLl}Jt@~DybW2`mRsgJtHrs-$>KI+J9c0vc40U6AisW|I;<Y~^??)Wo>l4~_H7&s
zZ?0rtZf0MSZML<m&*HqdI1jEvxKn)BMs?t9<L~&@`v1a>#vc-F%e*}~oXzQ+eW+(!
zHtZ}6`{>o?8Ps4u{QwT)5URwrX@?x9A4Ng8vKnc1W@^^ZFr3~q&AHX5*?U4L63(xm
zt3J()VSkTT|E@E~K$*P?EA`!tvQK{g-<ngxXnMn8ZIgrSu3`Lu9@i${qiwQF8-;A_
zJ0*;<Pun4V{p5J0*BGlMCn4pz<ktmWS`a1+PeFE(dO^5%x%@}gvdilxd%mu-!F9$q
zRbuAxHdfQ}p<$|HrlABgF$=RX2XoP>KIxKXH=?g;FB#L#7CX;q^-odkf6@Njr~Rou
zY9~{|X|lt9RqXtE(wUF!_42oM>1wuV|LrLZ3&b_ZqxCxq!$SHZGzllitAokLDST(d
zEyXe{$4d3^3IFC&;d14~Dl+fi_x6aehQ1E#u>rNl{^~aBf8W8jSH|T3%e_tgjwXHD
zmG;?)&De@+)FFWu9RL6L&;F16eS5WUGRSV!&O!5f{qM7mk3s(3|Bug<AEncU+z>Vo
znT~Puw4dp5O@r>)+9mWIaUbFAMtNt0JVfpkw+p+m2LrG2A0Yd%5B;ZUd&KZ<`bW<#
z@?m&Bh<!Q-W4wC5Z;Sc;TlDeYbV^wB{D^Q!{J{196b{Gj69$E&WdDO_hk_p8H-=-N
zR-N>veyuImgB(%hJ028@#8=DTqsY;y5jTb`Mw|mt@7fddy|=l_HsSb>r`kpAw&m<k
z_Wa5J-=)}b?DV+){{iP7@A!$Bgvq!9({VLAjjMMpWq&VXN3sihM|+0F>_{?aJil2#
zK+A6Z1AFxkknO^$W$Z|FIIiu`$Z)N6uE!0y3F)c!VNc#d?-|AZW{clOzXLh8X;wI6
z+`N~}+utp}u)o>PciAs*e{XePZ19G8Bg0~G<@_g-!c7O+r(|N*$nck0!@|o&!^385
z+W23>pO;S!FO8ZSHg5di%vYNd{xoq)cmb8GO2YF+CE<_jt_shMz9MWWnHHYK9~Mmv
z1C7>72t&f!Ed}9z=XnqhW8lkz@F>~G$MtbCmh&s4<jYbX<A)W6ty>Gilk_TO-z(b0
z)jNlTS4UYxuy{<ULEYA3eFS6F&13nn>3iQWDl{51YbrRy`x_Nn4v*HyI$A&K61Mu{
zkn%6I_R=NcX=y%-=kX#oVGE+4-Ya9A_h?brg4Gkx5C2klWq8%LT2$0&FHhC(D!wwT
zDf!>SQ|o3J|GP9a+SZI`S}qHJm~we|y5!QZe)6T^xrvvDKkgb9%CA*+sDEA<{a?bL
zJn!ch82_7R{LlCG=XKt@V>{7{*RW|-2yetVbyHVH9wYzm_}TKJ&{AeD%51(CXjf-N
zALTUJvEKKa(LbdRp?;oq2A7zVV&8<hE6JtS85~s<?s1OQ$~!|PdHd`w;kQxdyUyzW
zZ`<r$JY5*x755=NM!I^0_V0-BDZR%(@wyNC_r?Y0|D9|7zA^4|RP=8c5qj~teX911
z2w#$4qsI8?x8(Pzc3i!58ZslokHSZgoTpr4Gc|4;5fTW!VdW4E!w8&)GjJ9LdJDo>
zvhVSc;asx+rh+hmyZ{&B60~^^myuUsI<7|A^NeE**VFUse=O8KFwU}8U&62YPsaQI
z*F0;m?ly;SJ6}57;Fy{+<$q~mxQX6y*!;h(h2a+ZZD>+2BzLK^$;Q6IaEG|I`KN@t
z$a@g&>c!;!h<0}OaP>YO6n+?4^=S^h$l!g?<XZ8M;&H4*IiAGRs5N$<?;l9%A2^`S
zbxZ<Ddec7R`*rMlw9aPVqkTU69%*_9I!nzznB_WVx{d|TOV7;VA3)}&@&79x8lH8Y
z=kX#op=Lh&TRpsm-Y`eqj92NkXwtSxB7sJ%T&g_RW{dV`qwUQoe^LF*?rNu31mm@D
z_)j3c7q8(ByoI+>yFmS4X&ev@)%s_(%@ezoQ`+WDc-KDf<3oImPw_dv#MkJYul`3j
zf1jT1KBv9vKQfD){*C68{wHa+u2TP_z4UnBbb5*UA94MyZ>94+vex?lkxb9?{Y)Jc
zj?jBb<;`V-B<edw&bO7tiS;XoiHqy+k04J&&8R`)4Du{gJFZ?jv0Xh@_*^6pt9#IB
zzXUE2cM-BfjA@5);S%~~xB}C$QeD2vy{&dX<?Q&1X3z8a!QpD#uEq7Z0XN|m^xr=y
z+(z~(i|-%@&KMN#BJaUs+>ZzGFdjwUx9Z=^)V=KG{<DUL$HlEgIi8HVHu#D6`?T=0
zaa(V37<kbd9LDD2`dgYEVeN;-p~71IFWP=${jE*(o}Hc_w$NWij-EvZy=e9R+ZJg@
z$9CF0?dCG==FR>SL`ENy?)7}Ew1e~i2VTPuX}dB=d#qRcI3vDRnvG~iJ37&e*YF1N
z|MkuoXDX17$fUM=(|X_TM&EFy_V2Rd@Rt4F#=CeQHA{*^^}^!tA-!R}v44C_{}fI1
zBob)E=i=ISj|*RtUn6bY{9E#Sq>P_;Z#C|ZABB$~TRkr1>WtkRYtLYUxaZ;JuPh&;
zKPSF##P!XFh^rlMd>()Ee}V$z^Jqp3TG58~LSx^<`5zeD@0iN}poss$XxCBfI>=t(
z%s&1H2lyY5RU6I;!<?t$0`2?j3&IF`%{qP<tIi3h(Hrbjk2C0Jp~-Q{KK~Fn<X-|9
zE3R$wxNt5x0nzW_0`ek6KZtJk*n>-iFGF^@c6ph0`c~J0+FkB%kNfN6e}Klh{11>s
z6RvQ~bX<*VaXoIpO}GV}d*y%iTi(BHy>vHf|KmjeZu1n+2d!vB`yu)NsP;eE;kYXA
z`Znp@fxB=IYIb@5dy2zidcz^_ANSKAME-x8zWKyH?;rW~=jSVLw0~9`|9I3sk7Fgu
zBVP{=Pm*h&ly|=#5}u|%i|6qoYS(E0S04B2Yp7N)W{mai*8bn6p2Q~mY{9E>Uo@f_
z?dXgw)BlKW^q_aQqDmQ<C3DJ+=9KooG+T`?wxNBl{zs(IfxcUezc-$9+`q<HeE(h2
zkZ#|w5MCn(o*f+CAm75<co*+u;D&QTB(A^pq43A}6kp=R`deSqPprRHK0)67RQ<bK
z{>O>`Z+31dW*_@M$kt-{q*%X9vFleB?k<+ki$n8p?+>kGyg#(l^WS%-ICP}64eNYA
zsPFRqppl+Hvefs3mDTc7LVhB@bv^n25B)eKd{6%o594zjLC7iNF$^Pc8furRpHPnm
zG%BOwy1Dsvbjkes{`=Vf2b5!E`%&c<d18J4&b{paW$NGMf8YLZ+cPwr;XG$yEY3yR
z@p1fo0==g|8EBsi=ocX;oE^h{AH{wrFA-OzEpi!o1!6zJbn<FcAJle9=|{*EhHHhd
zM{=%lYs9q+5{UoKjdgea|3-ZU;RgHEEHds|c1pO3-mr@OZ=bmSPqtB=EF9+$CN^k`
zkhj>Ut&Sb`r8?ju`AS`Uo47lW+UnZn_h^^jC43K}?@bPQJDsfE<@vh5yq|*dzHzDN
zyIK8T>0I)2`PT(uvE%N?gLoK^;&H4*r#id~-NT)yP`btPe}Vju+*J0ze13BL&wjtE
z{})PM8c*VBRB6vWOIGjK-=c1Qp8g^>;aC1KEo?```SDw9dlgw_cR3%6T6!az(T=>o
zys?pJ?{*6J;x(*gFU0ZpxF+yh!f)eUypME=Jc<wLJ=)vdqXvhM>7OFkC2zOL<798a
z;PAP)=pXbY*{^@*Yw}xsj~{Ua;WghihG7Iw!x<R3f&DN~|DQZuuZ`S*-;RG|trK>Z
zeaGTlOu%aSDdwq_==2P`kbnN(L9ge2^7+%7<@NtD{^9xi9#X!CeEZ9BeT%g}zGeT%
z^?$ziolE~BT!Jew9arO8T#u}={~O4G`?WtVP=D7A3O5N?Ean$r9l=}ZHD&r$aT`6(
zX{?_+DBMB63vo@k<UwsrvT>&SU&H=i&rT-u|No;W*#5~BJDJUXnPcw3V%(1h@h~36
z<LFEn-?wkKV|vJ5W5$^>HaeMO-#6E>|HZf3-bS|DKd$Yc_x~s3|NmA>ryNh>X{1Z+
zQ#dj_OYdRNcQ0j6(qBYwuJQLJ?Ei(%MaH}v|NoaC2cp-MjtraZw*}QR-Pa`dHP!gL
zwd-FMmz?arMjL-86BCWUZ+8FH_Dk50Y;>Io{RYXs_S@;d1Nscs-Xotnwh_%}M<;rt
zEcO1;y;J_%?)~re{>l9KkG$6`y_WUve~o*c?fuX9{*k74Amut<lg=ARPa7HDBHu<^
z$;j|78OLgS-CrE5jsN|7U)&qM=^oFa+cT(IG%|cB?qht4&ru`3dilukB|SfXK)PSk
zzeSVlOCo{j%kjOqA2IOm@4^u>Y?D_o3?pzFvWo|Yv&gaNyM--E_TMuyOdwazmlv>l
ziT3%^^7OatOzS;dVB1Bw1ef6oOvlx@7T4nj+=N?j8}7hexCe`IKWg~~)uCP=VFMcJ
z2_%&daeh*>I=lt>|BIHX-$yGuiq-FAN0B)R_B%O$@PIOdO_c8++^>Cr2c6?#Jc_mw
z{WN51vi=4A|10TP<NQyO-Cg=8(3?<quTd8q)sKG2KC|8bJok_CVE(>mkUh&Yc$j~{
zks+af`rz=i<Kz6lXUXUBA~s<QUPUb$(TsL<q89`2`_|?0zK4Ay!g2oJ8{}Jf8}Fj(
z;Gpn6`5`{Wr>NO4|Mv|FpVJ$b%m4V2{xwdv-}L`AD(`A1see#k@^k;Q{`5GupDbl3
zpxiOBkL^VNz_-HXzK8G0AJMr({f<~i_n?=aK~|laL-TNU4O+)&AE2F{LR{m&18u3H
z;fQq1nGexFFT5@v?6A+ap<xKUXQ#9=j6MQ6dKMY<;xutp2el!{vrw~sXc$YLi@1hv
z{W|xB3Bng3NpC_U63F+zFZ{XwEfc>;{8IN%Hf<gnE^^EzxC~cdI<CgGxE`H{+`qO;
zH~BZ~2e0~{_&=)mJ6!(_(m3HScoRMPNZvx;hC7g(=6<KRUowtYoEXnI!!x_f_NoN`
zgVN*Q;L7+7-of`vT=achY<v9oU-a#)xc-T-_H*+Ggrjd~-hW`(&~U$f20qq5Lq3d0
z@i<nZ98X5M#WzgmCX|L}$^ONsgy+e=hrPdDgTjmSeEq+&ENr5u+4pgbYYRP&J$0+2
zUZvL}C$FCDKQ{(dSLhp*uj1H#9OI9^#c^HiCUsVFfqb^OEHpZ<8SUsqFJ8kN=sGBm
z@7MMx<39rOe(u7#InLu;aqO*mlCigf<705CUCM|t`VPtT9^;ym)E|{`PJ(CXJFm%T
zk4X1zRHu|X2b4R~epmQIB=;(JTx(<BaXY2ruClQ9(K2J3+EU?(5dZD^vHd<pEaN{X
zzeJDt?m6lPanbi8H%WOuO?$<*fscoVZ^`fRBaR^ack!3Ve;2I?ea8BS2*<V2hDG^s
z2qVZo?aitk1>rRM8CdgRX*i3l*~a%w{8)MeS--s?oJ*g8xCThFMfq=^Ms-&0-ZJlC
zMW`=W5gO2l1d?dN`>y2z$6bU=a2b9z|8Me&(ABphbfX8o$RLXxnrE&EEpt|c*7++!
z+q@N_ed>yknz%ywx*~LpUJ+^&PZ)oGBGjYdz!PlhCqiQM6CpY33H`TEge#nHI<CgG
zsQA|UB%L8#Prm`3_URh)MCcy>MCifa%>P^VL})=P+Gafw+Gl!4bDjw4k|#pPRB^8B
zCh6RQ+i(Zcm9DXDMYxOJlX7pkhrSrOb?$AAd)(k&t5<~k#Z?ur2oI7Eqh{BN@F@8>
zs;7Cb2R&D;6fQ?{o9DXIbM|Z!yFDLm$QFHDt$g*`&`wY7G=J})^=;6}rtU&F{^tLF
zzSSqC`81xzJ>J#x<crvZEvVJjs6)NFuVJ4)%A<V9$RwI}hw!R>e(7IaE8K`?WN*=L
zyi#5E^`Oux+>6)n2Hrx}9L)ZQSA@6eeVt|DUGjZ=h>!6pKF62%8sB2zwH4ue@<$v&
zoc|l%(B{A}jKGQczo*fA<g;$|eRhH}Uijqv-$U$l<n!;g!T%%w($BKrSX9g7^*iNX
z{R-y_ulZEpH9h(bHMYpV==)Jy#kT%C&IuQh1K$r0myi|K*<EYCW&h(V!ezpJD+|IE
z<aAt(fzSUTTuZhsD+t$<H{d4Rg1ELyY9Tw2-Xo94wZZP7--X;5`QG*n**m@<+#_x=
z?#F|87?0v{47~A#hwa=)R)li0|6AcA<<xTRjj|A)6!$cqMb%;HhG8Mj`F&A%6KXap
zlWGgY7JBskt?!cWH^}#OY)dj}`^oxW{nCKON$Q5V>WF#jzd7nn$HsB9SIOs5)#u-a
z8;69gV}^t+2lY!jrVFu-?%A%6+N-W2v&bD(|4mi@m#F{Awj%ZaXmxwB`k(9=ZtmOG
z`$HRBI^X|heZllrYsooJBT_B;4fg3b(AVEA+>R_gt^w40NIyRPFNYruFYkNQdXlAK
z)A+~3pASABURqdY{BBuzanFO{Pkj%D7fK%vl}jHE&*P7YCH@(=IBY0g9G)$HAgmt0
zH2llH|08rtyB8I67KgQ!%R~9bC1DMos$L%cwQG5J&9*o2%(4f=A67jco}RWWtjBZC
z^~buh@Rn_F<Ao)q;ZL5=^L>wo%Cb^xUlyAGW_-!E_wgY<#-_RuJ|(OCL+cgqV=F!v
z{<oIlp~gCu)&9Ho>PBN)8%DFuMu$4%eYMMmho<7wLUR8oHrnxVz&7*XS`Us5E!*@J
z=`U)Rr@oX<s_!g)F=N8l^l$M!e#A-t%%g+D5#g%Qg&}Nr-%qk1wN;1FYbF~1nOYb|
z&>NQMLs?iDPNScJrup)axC9!P6^66KjYav85#d~N0{ZS}$CCYD|DWL^@)BHzf!d+r
z3UWHG#<i%^zg@dZA5Fq}HcEG^bi4G?R69?dH8t7a*W2#~+=N?j8}2~%`w^jQwr|ig
z?r~k+WJb7`%(=F#ajX_;G)uEhnyqBYIoic_l$yJs|EG30pPIcxl#fF~!$jqR@+i(V
z$om&6AMSFVd$1Vy<Cp#o4+?iqF@A||?Tq*@#a^~SCZ*kyVIPpqOW6N7>HjoM+X(4V
z+90Ud;<?{Y6dsk<;~02#sBt*YGwN^a7tpuSC$8@bK0@XtJSqNZJd5Y?A~vCIviwNK
zIT){!wMf$&$!7HIbB)OCb&ZJEmc<a;9!5Jl(Tmsc2HryL6!tOuvA%$PjFbN9?8}q>
z{}b7_?ASK4eLUM8X>_1-yZ!-obesd<!=~;fGs0PG1?13HH!{5KJn!Ote2DZeY3vyp
zKBmWi26i7E89t?FFJiMfJ{!}pPp^HV-C1>DWXRj0bG6?pM~2Vs`z5M3yY4R6y@nlc
zUA4G=UUHS|Mx$*B`+jY|whbf0x8(OoSC0%ol1GqAICr1xMtD;lh3tN|_-^Mu<QnKz
z)u)DG;z!^#oPnC1)(5~@^oFIzfH9VSE}BZLL5ak?5&BZ~mksxPQD5vjN4u`^t_w*t
zVS-~Wz(u$Om*EOb$JOZ6hU?m`j$zODkiEj0x#|Ms&@7D>v`+MXyvO!Qo~`qwov%ax
z#I@469yg#$d2<t4U0uY6@Sner`PAvZzfD~Ieq|9F`3xld12BoEz5H2bsgLHUkMsdl
z`QGk`$ItZNTiiv@um2O&!;{s+6Mc^*@)wyz(`aiB%x43vv3-^8_-+2XM;eQ9KOV%x
zcodIgB{~<e@0V&LkUeGE1Y|}yOXl|K|F32PB=irEZQ4ui^wehcOr`#T1>%OgN9jC?
zr|~S(Me-<~r{~w#WhcK#--Mj&&C1Ui#PxNzh<g>aXha-$Y$oHsf7{8tUpTuW_6tNi
zx>H;)UPJUhi0eZ3J<tFD3h@7rYY)6-+uL{-@8d&!j8E}7zQllj;ej827b@=3J{zKZ
zt5ny&t$bU~rm+26r0L(2KO(L@-96v8aY!G?Y~KbM{}G$Ff9J6g5!bqHOsW4f>VL9H
z_zlM%LHNIX{}_f5I1Oi@bBXbFoEYct9iv`Gwn#l~%&~d5`d^!%m29(Ld#Ukx#PxYP
z78-xI?JVhx#krV(niBRaE}+Lbanb+%BKjq0%IJSb0*wVjIe7a1miqn{X+O_(-VeRs
zWzxrG_OJL<zSkd>UmswacEcps$1fpU?E3b&|4RKIo1KIFd-s2ZG^XQfT#M^*18%}C
z=v*fMp&LCox&G^J&sbi}-+znyN85VsALmKU)&GYM+$N1G_jm_+7izSnZt=YDp;y!E
zN4u_NY<c1Pk#Co6(1wdP{DPt3L2(b`Q9O>7D94j{8qeZ+yogQsW&O29xO}DWc_o`_
zguc5*|C-kyT`Ru-MbDRPMmyqMv`(@Yui*{6g}3o8-bb4}_aS-0zyD+Ur}!M*(MI1V
ze;$=T56GX|<~e+6+t>IO-{VIdL3qpehFWDp9Zr@H^aPS<Li0@ddyf1)OJ1HQ&y{-r
zNRN@f(K$)^fbMC|k6wCyeInbUFHe5lXOyyQ{PA@QtI1)`I|4P628Gkev(T_r|AfB3
zL>=3|OaDZr{t1*DL&#oV9M;}r{Qqn7c=ch8wSO!J&Lsyn4Gt5?3vdxG!DYAt({VMf
z#r3!WH{llChJp6jelHGvkNZA^<3E7zBJaUs+>bWx@dwF=@hBcg^zBR?D)55j`>nps
zJ?ewc$A%|`a|wF2a-hYx${wg4ZTxVI@xw{R50RhWKhgN%WaEeA8ID<f_#9*M=eVhJ
zLb>qMsPKQHwdPWGZZp1)ZuFoR8Dx<|^8(|S(rJ}mT<gAlyzxutiF4FDP_=bjcvgDP
z<3(&jP4zhQ9>;|(^f)G7zj0i6m0pXc1FmhKG0uao^)SS>ZS<c3vK{HnIiZv6MJnYV
z>$dgaHQ_gqEj8|sUSv?~n(F2{$2|AHM0%ytMv~ryw;b~}-o^X)5Fg`He2&gN?%y?a
zyOtjG?sxylZg&5b?thH?N2|CtwA&|zG`#~=dylVSaAN+@m(q-D7*xBzujvi8*W*=r
z;#=Xkc0m$x?SlBO;@VlU?<qT?ApB^1ntp@~JKVqWCH6V>Of--1dhd(>SeBlf?A=fC
z4Pc1<h9UZ1WZC*7=mWKUAILM1yI=|%|D13Zz3+-~VJvwrCg1{GgiBDhlOGyc{r{16
z@6lae=Yj9Hev)i(Pz(tqkbn&-B#=N04P?NT96W<(!3<spW?%+#U=GZIQ|Q1dwBSMl
zSYUCpg$*{?$Oa4+Qgpwd3%dS<J1~V-w9rBdEwm`YEu^@D6k2G(IiLL_Fm8M1bggs#
z_^#*K``zz;Z~OY}%d2fUk<3Ohzsckj#DD9=e*sl$7gwNKev1FtDb)roF$bYM&GFuj
zlNEy;*QTzb=Pk<&Q=LB@GcXIA_M6X#Tr?|FnvmKjjBH06I*>%|Fm3Q@-q8qca5RiE
z2cO^30ovdwagU{ET^r?D%GCk-$eH@aY5K^R<9hS3080@69k`5KfmLV{zIllEJj#2<
zKmET$YX9}w=3mV>*NUw7JsKt`|8tc8-v1VB3b&rj3~R*I(-y)yG8cK#o$vdOwg<?~
zXTmn_?!o?hPIl=N?2K+>fOX#0Zu(yALmnGI;|bTeoE8ppx1M!xr}f|ICFrP?22T0*
zAGueu@M30I#J>VHs7G<8`ge%A`}A^kZ<&638~qsKKkcgY+bh+*736V#pS2&I<nF{7
zoI`5=@8jwP`XyY!Rb0ajq~`bO=a-=z71|D!nd*4$hidYsFt>3R_Yr<qy?|cmgQhXs
zAE(*JPOy(@@3qk%@Bg)5`)`{0fwRpIoN2C>y8rS1#3Qsnv;*qYv3<qU9|JKMaSngO
zt|!7!dMg>{@DHbtM8^ZR4YZ?8xY7Iy*PDkxjz{rF`nY5^iq3m?r_|r%WbU~B@6msi
zD}<|TR@Yu<pF`O_^Z!smuf(JM|8up`Ha^h4`6z@b;)r{mZC%CYM87Sc|1Z}+?$TGn
zzqJ2<k1^-z;+lb3i0fkJkn^wr-B+Fri^ypIS`yvHqn1UtG`oU~^WRpHYw(VFaO=oN
z^WSpmkLJJS(YN8B&VS42w-dWj_&__GEc(RyKYi7G^n*BzGG#$5BPtNfh*&mM<FoS%
zN`$LG4U)6^geG#!G5zb|$^`l`9LGs?;tb;0L#cdThH_NM`<2Rp)c(tpV*Q&H&Ixw`
zmv9AFaSb=nG)DfH-yV<u%2%oVUwq3r&m*<}@=W<(-me=a|BJI>g#3@A1N809IlL*J
zZgUB5lXnr<VB9Ce@5#^0o(>sga>3J~7ug4W(W5QUpUfQ~eGHO5#u)qQ)Q4E0J<UJ1
zKL*Ee#tMd#ssA6CJ1~+y8e=gYh5DZLt~H!K5!ootGJcH7^w=kj|E8ZppNi^*`ljpk
zQBjfh`7oVd?8DC>XQ7ckhn$B7dW$vqtysXl2uXSe9_<rYGGG69xoa%e|BZ2ymE2Wi
zb@S6<iF1}=1y*4V)*%;pXtKt<*&2kF*{-uxyyhOXTZ@o1_n>2;`3I-0TgQ=$Yz?TV
zH=t3t`2W9c;)v_m8<8L5)fP;>Pvb&$1H?B(eBw!>!#O*J*^L&*T6_C{K;_e6FL!GG
z&q@Dvp%?Qj!a;iZOP&pf=_RP9$2AC*h-(l^R(gJvp&S+arN7P6AF3Oa)i>DeomYc;
zG@%X0a6GyfN`H9#|0#MqlITE<vcI-S`a2~3p`PAQDgEt|{!n)>E1VQhC(hs;8Z*_C
z>c$K7)`jdw>g7xHE9h9Nz00qCwf--z@+&k4{~CD%#V?poOx{M(O!ewZ@~wLJF86(;
z*42%XkChvh*!W4Xs4M=@&Kk@i3Xo1^JGM;Jubq(|`q2BLKL#T1`#qTKwr+kXIUFM~
z8g;wa?#YH`?f>=ZVLW{zvJv;ZYQ>{{NRrCb*at3oQU0IpK2g5deXVz2waOR7`Au>B
zf3owZU@E3#24-OnQu{wG_x=~F|FhJogS>wv=^dHg|5^4wJpTX3YVUu&_fN+4S6yr5
z+40u?H?ael=RZ%J3$O@Funa4(2J4WEo~zdWn;VcvFWj%+Vr*z5J6PNcGH;)`TPuzK
zHzdL~$5Z?Nc(yH5z0b~!@Ob`@=dM7i|68P<($_C1<DB-&L+W^X_2nnRDskjvCw5~m
z_TeB7BennUYW@H9?DS;Ya{cQQ`q!uRugMzm)QbCvx-R}7s9t><>(<8k>R<J8-3WD{
zcq&kXdNd9bW`gzyy>+SjccHqLeheM6)xT(8%x}5&2fsqcPm-M|=6{AfhoY69OZ;(P
z-3#28P}R?KAvMn7Iwht0|0r{R<+c1zd;W3qAF7diN?Q|~n8~j=Ke<Mm6j$lj@Gs84
zohkp}2;yEb|8)Ls{3qcJ@ub%OUrGx%>5aA0$cs;e+w|5w(iZO0?<4*@FgZ>d+bpdi
zgJ0q5Cqg)_U4-I0#)!#2h<gZ>omH=)FL!@bNq2Eyi`0KcKS~P&`3=TU497@}##oHU
zL}X(!reG?jV+N8d?2i|+L)UTR)KiR=<v;m&AC#CTd)UA>PjS4jHihur7rqs?%*Y8_
zjj_$+?m2ATNR*QkVG+3m%di5gum*YA*<l@-8+B|$xADk)GLC=O%}ftF>ASHP`_Q;H
z+JMr-L3-@-w~S*Gr<dT7-K(ALu`SvkbL9Vx@_&K+ze}Fb9!UNFQ=Y9xUWv8jsBmr#
z>d}NY9E<u)<u6(gZES5TeH$dvfq%CCpI-wW{r@kwLEddwf0HN0x249q{43d^lio9n
zjc&Ac!Ly`k?(S_NoFgyb60V?drnE?QT=;spM&3a2TKRvaa+qE|$+L}1gxmDHNd5m$
z8mS!MS+R4>@USa$WOyrUWO)1H$gq3!n6PKU=<v>o(c#^Oaba)T*ibNFY<O?agz)~T
z@nPTc@nL_#gm55dV)$Uf#4t;ojpv^a4JV!t$LV!vpAYxR@cYsuIxapRl8(1K-iB6s
z%k}3&FUQ`TKRlFv+?x%eS18Y9C(mLRx3;aqKijHbFwWK9Hyr9R*Fo6Aw2bf}y=Z?%
zIJ_q#{II!KYMkugok`(PW={C<dUhzhlN0(1+aCil7_}eu4>iWlQsZRn`-g_}#>Fty
zvEkS>Y(#i<)QB*WJ{nt2j0#&fjt+T)Mup9IEp1GA{oa@`*0G6rBW+xGbNl#^ziVvR
zF=SkL>%|G-?Lia5mTSg~*<|0DJ1M-oaAMefXMEU0fA3Isc>l$mknP;b*tlLfjeRd<
zr{;{6T>ow;eehjvtnYdU(%F#j>i_t6eb#sNf4&>0IA<!RV+Lkn4(4G2nuKc}VE+TO
zj<Ekhj{OK8c;|PX4;`O8A8OXy|KJ7tAD|BPXn4u~2f`hl@ZC_jF*__0&k`)d3KVb7
z4n=#i!zy|?y{tJqtf8+%_4aISlWb!r*`We?C)wTX6_D##9vbZ-u#L<|1HGj(C$!oF
zVkh@*B=_fpj$zskMLD5;kNqF4sVp60|AArlAIP!)zzF*fjIsX!Sv^<0S@MTAZfqOu
zb^bmaL`jURN#9?7ZPp%X8D#toZQ3gBNV-P6PK~%~C**`9sLRX=^{&-$NZ%iE|Gh%%
z9V^6@THmK!si7BFdVbW?%WrvpX{(9ehU&#^j~6}vS?6_ShhzNeMkT^=@+9Kir%v(=
z;y=t<&bzP6=IC)>Kys~e0c|JTCwYnA6~uoiT_uYqn5#$LKr#I$c^l>Z<YUiQ*`Vz9
ze6ej<h3=cmhl=lpuF21vtMJ`$SJ?Xqe<1%M_5QSJ<9eE2+&v}FhdyNAs1JQUl#H?e
z?{Z_Wiyw}`R+ukeX})|FS#3=Au(;yhNy#hep}%vs^kpmFYmNndFq+iQ&9RPF*TlK{
zZJFxktcP>fI@E17=Z(Q$Hug%^30J>X-Ht~3(S_>YN$Ov7>dOj#zW;Bt{y(ZN>i?gA
zG7NQ%;TZ2eMv|j3Ho7qpP1+F6+7d0s6I#(`%%L5R|36Tv|Ie;)1a+JB|Mwez&`xR8
zUOBpwpSD1@cqU^CrlQe3HMsZb^j39ii+i0xpM|)mVREi#oatG$HRkXu)E=2fE<pTu
z;39Gfikw@f?Ge}YFXLW;sv+7RceFExX=e=Z{LT9Rc>Mpb9A)ze-*$|8nq1|a&)Qek
zaIZryk}Iq$+-QE#4)%}D>>mZ}AKTeK)F(;xNr!T#X1?}^a`Oo4P>+V|<`<nczX&P&
zVNoKa_QM<bFkfsFcRu1jz~aBZT8H`m%ha>+n)D7f^tiWoI|{qlpZM>^J{&~syT^Zj
z9j2GdFY$jrCG-kZFPDG#RjxMHhf?=kvdi40(;<`{GB@e6{tl|ybE^0yM`wi^=hdSL
zZ8(PGIElFTYSUWJPPS;LrR-W4<*^g$VAOgyHK=>RJDBF#$@;mToowXp6vr8y!v$PI
zk9GK0$g4;iW9#<Mt!wnIbr~Twe$rcha?Kn3ZlZ3f_8FOX-1{5C{z?yLyuShNske3k
zwp{Zc03<IzytmXm@$=em!u3KQ^hM9P5c-n?qdr~!W@GQFH~!E5(rsPoVE#if93wFr
zV^R1qTg5Ts&IM^<JoiKt?@9~V<Ybg@mc~#ySNSnh+L|Vfk+<FJ6vwAxI%Z%N=3pL5
zmfD|BKe-I$`m_}d_H}E2TKQm~w^IFUed^jf`qvNQT7LcOPxP<%=uhw0zus<NIeqx%
z&HCp%V*gzF^xoQ$q<3KBe)S<1xK?aqFCv#<8CGCR9GlW!Tt#1lb;w0tB)<CseRz64
zc49Xg=eq~&rMJ#>?e$NGee{Fq7^g2kO`qN#b<qY<I9K1Ce+ep3gW~<{AE>97@6tDK
zH;;kdhU#1T@7wj?H|ndC$M_w`Np#|~<7;QQ^Vng|kz23Dz5Fu51^Oji!Bvz<Bc;+z
z8CgEUx5@d(`Tv?QH*gcTaToVt(P?f5*l0sDS}w{1^7B92|3aL#%90~wow)1i@!v3w
zu6OjFIzb$L&=+;PtlcFa?N2+9J{UvM!X_8zkhDEeZ;?sFwQR+=5@ERGBN6AEk0uLG
zCBj&8JgVB&RXembi_~58vIgyJa-w6|c)Z@zx1Y?N)Sk#2m>#Cmr(*_YVGibD0Ty9P
zoMUfnZ3%rDR$vv@U>$mD*apdHE6*e2y5MbOK6YX^_9Cu(*he13VU(b&!gnB}eLZ)a
zG@A3U{n2w<KYf`!FJqX|(ox($V3IyUA9MY$xt|4@;Sv+ZqyLvS3D@c!#Wp}2{TMn{
zE0@rIf*+2@xWs#s?4DsR40#6UZ~>Qa1y^wmH*gcT(R20bP`sFJg?=AJ%M&5|q4!^D
zUJuy|RbBEr;+`LsD0{*CM<2)fqNG6mi?UMn6e^k@+NG=L)#xwGXX~GV+=DR`$yM54
zmFoWnbtJn%E7``~&eonp2Wk%K|KkYiP>;ASuCYk{@48dmceDY-{bTFtjX7-X;vR`;
z1BiY9R&43T?ydimq<5fgrRyN}{YMKs7UMAy#cPdUIF?N>7k9J)Or}pk{HIP8D#ac5
z+D)y$UoOmI&%cA8d#c<l485A1>YVAAfmxV?d02o&XcDGb*cRtK{{KOxHbRm6T(AH4
z!1JH*{HHxXS%2B{la1U*$+}OB?Tcp_R$vtxGll8*L|8*lt$&<kT{=A%|8)Ih9=}50
zX&af3;uq{^K;}vJMYBEEde8Nu`%w<<<zF@5b8%OYm5!Gb=>L8sU7#ElsFZ)J<m+l|
zG*+LBea_u<&m4iB*5CW?2e}WU>4fq1)B4zC>t*c%db=_H<Q?sV{XZX{H%5O1b*M-D
z_eUd+qHdbDfp{uVgL*XbYd{mdb+fq9Mn8rQdJ^q>_~AIe!Yp%U$W9c~&yeR(M34Xc
zDaQrwOGxc2#9eV#8w9aV?K=E$pZ)H**S`03MY!%WAzUS|;RbppT0a}*ZFMGjH|lrI
zKT(%G>YwC=A>ls%@JI4S-yxwFx#gJmRWT&=q4!0948&jzMe7Ok7R^g+HYc&ke8i5^
z<}ITA)WbFYbwi{b)S?DQg>BS6Z$Lfbo?K;!Yx_%3y<XZyB`Pr7HAZ4I#$r4sA{&!Y
z@`?TZjm?*hGyaX#zWDo%F&n$C-tkPx5M~OdVmfAE7Up0cno5m-qh-Ev_L;`jR~lDG
za<y^xrN-qS82>(P{QHFQ?@Hs}&BnjcNIz<vzmC2@Jd3adyS=~0A;Mr8cWZBPPkJV-
zpvV3DJJuUpM{587e#XwVAqpQn^Kc&8O!NQdI<8(=C43RROxW_5o_RR_tvs(1S2Ze(
z_ow#%pKttsxp0ex6Sjh^<gOyC#k)p4>yR6-k8Q}uPVC0x^|x!a^U$U(*p5^?@4WoK
z+4Jx4{AAsJ&rddRH<CxS#p*tJChQf@J{-hhH1cae3B7fi=YPR{(QDAbo#bvO+h&T7
zUty_z4#+kX(~ptIQA97x^julyF>rUHio2S-Vu0r)ONMEKqYUM!KqabBjWf<UhYLt9
zFb+Uo!8P1K({}kEE%JJ6%#+RXCX(_+2Wmbx2Cw`$LZ<%jM{iJ0G@h6L+m$V)@{hjC
zrgm-gqV#Z6obkV$jg{7z(etjT-^i9R;vXUY9PyJ$bPUk{xNaV^<6D-ff3+3EIq#s$
zK1%#E=;cd2+d}i2>3vW=*R%1fM8(SV(3f9-48&jzMOXH-VK~`6*<KFhXpBW1j~`D?
zL^dX43O0@K4E?kfhFF)v-F-DZOebSIVg}ix4WGA2n@Rg1wf=reW|+f&9u{B`mS7oH
zU=`M&r$*c5usMR4o(b!?yZf?7lX=*Nd_<e_PI5Q)VjoiL57a#e>2XY=1&8S+=y;$G
zMI0lK|B6rbztu1CA9v-ulo<`mj<d>;Q_7N$%tdof1!_=_ls%!=_d*kQ(?b1kw4ha=
zx^1!kw=sYu*)c@@zg^jdBl_n5Z2q4xh3dIB@f<^(hjyGSp80V8-%0wT{dZFJ|2k>1
z{WJ6b=$*oxLEQW292xiixj<e*qxQ!Y@~ZUF`ht7Z?{9z6J)#3~51^8@%1%65Czz`L
zy^BZd1g|>(8gAewZsRWQBb@jAJJkPs)c<HL@+|GKUGf?G?>*12u0L{7U5|S2wE>U*
ze^i)eZI^g@p%40^c#!A!Ui#C^d%L$BW4iRgsLpb4s6+*Z@*9qk7>&>NKgM$BNrQ3U
zg00s=*fRa;Fp*z2CSwXp_N)K3L(5M8y#A+G(o_5N$ZJ!DnT{Elg=BW0Fo&Fn1!y`Y
z|D)xi{HZ=|yC;7lSs;I+=Ecv<&zrCQ=WcjO{w9wKSN9^@zj&5l8CIb2CG|g6(Ob8R
zdt-W7Ltlpu?j(0R*|tmh!LKk)8=TyR;@-vv$ek#nmz_|qolXzCx%Z-qyPCT~`BO=z
z_V<qVMfo=`m~$XZ*I{{w>^AmVLRO##J!9Dp$R@Pm7`9w7zu>lZJpCj(aRxn|PlR)1
zSB>;Y#((=>BCp^ouHgo5;x_K$KEjXm>(C2v|G3Qq?GG-lKHS}dwKqm7YlgUA{iVMA
zQvHuk`_C=051{e=t}}m_-grwN@1C(8$6GJT|7WFvtLC6{$2Msa?TG)XDY<3;G?bwn
z731`6QH5$8c2E7CHx$D$5~DE|<I!~8{C>3P3%71IFJ8a7UEer~js|_D+2)_mH~*Zh
z6R!TA`R8b)AMG-Ker;BmD4uLg#uOCS>VG$XK1`*T7wCWQGS7fM199*CDpaDvHD>Xv
zV{@28&O_sMa}CHvXrQ;SS+ufMjQ1Q%_$ApUI@k=___rUjZ~ZXi?<m7R`~RVPUgn$?
zScNr+W&Fn859_OS+)a}_KUz+Ce#A8i?TG*Q>p;yI&p*NQyWiCR5e9hvL7u<2=SN}b
z=lwufKFw7>u<>kWBPg!){Mg37{FZyWZvDbY;|l}X#V&jPv!4HB&(A+!n7U8Y0pxDP
zxu1K<eTe&frpBe0OV8Ygk)(Ib_w8dJlwM*Ef2n!>Wy9ouRG<=ZkI!n9IHv+NsK<x>
z*vxUzeIL@VZQ9QMj+C8nk3Oxwefw&8biMpf)_iRI`xE2eWZeUE@8}Je*~y%HRM@zG
z#hWL-5}L%_hGQr`#D2*CI6bxhsQWlc??g2lc-1KPh&YGfzzBOddKdeyKNa>5u;1W|
zUozhERM<QBOQB%qm%=+MzYyNt_{Fe$_7}pQPreY|I{S=qE^|&Bz7lq61Ll=J8#a%4
zHf+Jx+AoGPu5k{V=6*H2I`gX`|HUta9YtRbZ{W@5FNfSo|B!aU@z*AOF}z+WUVBFF
zk&fP3`gC~r^3&mxV?Flsf9ure!rSXV7xpRVy4QR@T;bof*M5Oy96x?f+4a8iuq#7-
zye%_yU;0V<HOFq?CVG@TJ^AJYtiXMC2W3x;XX7yKj7g5wf07>8KF0M6cZE;2e@179
z`}9Za7s4OQ|0kpYWYBxzasJmfjdRuWgujhGj`c-<48&jzMTtH}>Bq(gy7V<rK~L@1
zd)D{>YV^NraRha!SI0FVuHirWQJh1leniV!?E|Ft-y5Os?WYdTQvXlO48vV#Bt~N_
zieIw+4CCqLo7I1qNY6&Ky08jy@3IO^<~IdXF&#573v)0J@m+01EH4*u$1$TkETJz$
z$qDuUdDlSv*KfrGaSve|9l&mqWqqbFE3n1d>#c*?<LPU#4!L;z|DEmH|7a_auaKm7
zphiBcRn{IM>#noEDVJl}+(<vlE>kz3?L$1<kdM4J$My@eT0fq<wb}QpO$)o}d(n|2
zZnUHAkoiXZ;`)Ju<Y5$Fl&8rG6piy-ci0slxDW1nr2hM|mOV&0QAw8EQ~x8iK1Y3D
zF+#mKMjM%o|K4nJP8*KlI6m9{JIUR&NBzHF9eqgskG7TCmPn!lHP_YuIC4i_ukNkC
ztp0CM|JSPjk?c$lo#Huz3%G<UxQd>u#=^*G!@EItk2XJoyp7a;!h`Rpx4z`sxXbT8
zI{Jxot@go6_wvHSYvx_M&j6raMK3g}_Z!fMemL!ZdeMlVr59iPS^5vt?xy$UcW8h;
zQiteU<S1kM{at!x+RgN;F@KX@o%T2BCH?*(y|myT(#z6rr<V`;yY%|BYv~P(|0+Fs
z{C;{KYSM0`*Us)qKk`yfdfm#uPVX<iff$UT7><z`jj>4W|2sZ|U6FnN+7n?szlq4k
zWaLheesiQ>aw_+9%s}iL^c-XVQm)S8-r^azu6vL^hhD6VjdLjH(aV?nwpc)4gw(w1
zxysL(o^7G>eaENirF%Y2FWdiVdO2B9`)PV*`={wurJtr(V~TT@U>R0m71m%Ka?#}7
zHIMmOdJEZ_^Rx6evYk6gcAWlMdW|@0ab)AC>2;__!}d?p8<+kp{pgUNrRRyGYv}#-
zZDjYF`|0`QPVB~B?88AEMhQ~&uXMEuJ$)Xe*Ufg%{A-Y$tbIl{u6K>KpQbm_TgQBw
z-h$Npyb5VD>(lgP=BMfH13pb}L*X9xEZi|1$4L~o|17<z`Df{!^zu)BmR^Q4^mC~8
zOjRd-mR@=OXXzEVz_0Gsr|Fl-D`@mSuaei$KyP_bx$u(o%Y756|H5&%&6bDB68Ba*
z!TZng{-=4C7rcM6YNhv&+s?U*`v@1MVe~>D^hMJ&^H0RpGT(K`HgUGEe%Qu~w&ogT
zf9+}I4_S9v`9n5v$2sCh*PH*{@F2avcm^WcU<Z@MryisiVJN*^{SxQij-)4zk5_ZY
zxwn;M+`}_>n)Er__ds0#H(IzNvKV9Osr7&T^}$DJhmTOcC}SSm7v!_?!c0UqCSwYw
zVmfA^WWDl9xl)F5RG_j{`Ls*<gqn@YmmSKN&GNePs(!n?-L8C5#vEO*{Bj@7%7+%R
zb-przZ0Amr9ru_Elo@qd@{nuIL99FHkqZ#l0Jg{@t@6(z?j?w8|2pPt=d9Mwp)cdN
z0;{kF>k#`7xnx`mlt*quK6=zUsrkA;Z@=&4mumN!OTBd<+ra1yYu2TC9E{;h^xMg<
zRiCDpkli<>53&aJXhIu~;W+AcrG=B^qxEH-^fNez7T>y6dT3iNeLPS;-;o!t>)(u0
z|IJoUAZ7pFqyF2kjwRzhp}U=X0he$ES8)wDa1*Kdf0_E$>he~yO`pG=OpU+i=>Ol*
zK2WC~A?p?!e_wC>ovok|asFR--^_4ZJY6?4!d>!l`&ylSpF8}Ccl7i52O0DpYaDx#
zTiIi`Xb%^<*FM~R(H{d*tWGV$V0!sl&$d!OnLZqG&)}+i<`~?v)<AouWS8eh8B**2
znmvD!=dbkq<Vfd?##oHUL}X(!rl4uD{O6sf+S%T5YF|I@)IR^};+o5zUpz;|S4XDC
zKT5Ta3LcJs#66Lwif1}zU=|ujXhV%Nw}2kUK3asCM_+)B<!t$jAC7&rtu@zxU!iAR
zLM}tG@GHo;2XSitfn#wDBq>d<;a@e(bNS{KIi7QZXFs71sMYo-%Mkni6=d}t`5fz<
zlN;m1HsoU`c4IGM-@oZ2dF!G)M#jE>8^751Pu`L@<+Er5h<oqHHF|M<db9x?MYI9r
zPO}CNo8--C3rJqF_cqyxed163xACgDx4Q<~#M>U@rbk;qp>pV;aEDQX3Pc+~?EBZy
z%TFjfP)~0{H9gt@;vS>X2GGXu7>;AO^wwki?j*evXVA@Va*piEPYV~wOSpn-xPjFE
z&#RSx>)8(`n`e8IJzz~%xart!+{Jx_KUH?47y6(t`ePslV<?7WBt~N_dOEewZ|Kw8
z2WUKZ_dt7)klC1wDX9C{S`acX@98j|oPkH{|K`x+-jk{Ie+T=7dE6btq|HgnfN|0)
zS@MbgpFUKXerh>c5&PC;YM(svs(asLTz})oPy5l=(_xl47GTR><8juEruP4PQ9t`7
z{cN%g?erwsvDW;6`TD>2^ndkd>x%UM+x36RxX%9Qp3jFx;_C5?OUPwdfmK+8bx3;u
zxnv%;As^l5QS2nU>^V{RvAqH5d$A7(QOvLC4%-Dit|=(fA1|R-Ag-yen)J}dQ9(Zb
z|K3c`kH`BPtYrT{jdSYJgf@KEzIH77pVj|I3tC;btxNwONpzs5pXbLB)S>>C=SL%s
zqRzdY6i;gYZ<F~b^mr`VNY2n(&x;%9=oip2LELCR;aa%FuW-NqC3zLa4fg#bZ=lG#
zj<%CHhv6poZB(s%_*~I;66Y{H?*D7+mhX`F3zPw<+O7;h{Qult=iNu#f7<|eZU%ax
z4<7gbozuEYz1XZ?L~Mh1pyr<bKaMC@>Xa+>m-YY2#*g&>g{y0@##TK2F%W~%I7k?6
zi=p&Z_tk>o^pS}F)J{6y?s)vy_Go@_54^GDM8tpJW|NaqG+UXr)^lOYRCTOpiF=Aw
zD_bhaO2_}D|9?#p&s0pu48(r{<`(D&rt(Fcd!+ow<Mlt6<t=Ti$NN8ezS?%rztOYe
z@&6aK+v5I@v&1<E^U!lnepeqZpf8GU{r}_*_Wv9HU%}lym_3S&{h<}nZ4bt{|L`jM
z8mvPu9_>GzM}M^c@HTorIxf1mv+i+#_lNk;*YW&!VmJ1p*j&9L^Z)kIQ~Uql$_xkT
zhf%FAigV~Ig{eU9Ozpq9%D;td5!wtT!aQ1kUqMf;|C^=!{zUsg-P?w^Z%^{9{{Lz1
z17YgXgf<+*ar8{i2q(#|v26Tg_wI~vhCGK0xP&W+ec!9(HQc~W6hE+kFK*MzjXRWS
zYuu&ZN40T)s`=`Feie?DxQ9~rQiixr;2-^ecfWb{m!)^~LLYoK|M%r?+Rna#)cC(P
zL|e1E8OfRY|ET#`{r`#j|AG2n8z^NTIiddlNc}HdiEmNro0Rp|Pe;Wd{c=>%tC6?B
z_gcsHP@=t~4jSm1gE17t<MoAUMvHS=`#mhv+b20k+cvguYn1=BFKGY27;O#O|762U
z+W+KH;YNyMG{z$Sw|6}G=>HlM>Dic!)c-k@2W|W1z0LBVupMMv<2%LisVI`Kr;~-}
z?MX|{LKVIG^20VoC0X`~{CJkV{<eKZ94{GX{1Ig+N5w9408xc%^Y-QmvjB^*1k11j
ztFQ)5?dCr%Hveh4`A=k<dbpiTa(9q5+PbOz)lN&FrTT}><^UF%qjSg{K;*Vt-_cw8
z9c1mI_;Qh)V?Jbm>-qQj|N62a;kdGI8~^-xtZi`EN$$p8)ZNogArIm(N)Y#Uh<m&y
zFBuc}?pxOSuKa6|s(*9jKeP$cwL+Vie|ImoGP0-BGmyt{Jo;Pjd6MiplNLJ3)clLv
zgG2oP-x=;gW!pLO0xsbSilwQdi}sVEmwzN}-LpS3{RUF||9R$0R0xyDZ_@yI_+#t)
zag+Nt?&3a5dOvG_8hhXmc{Y>}cs5jI+56u3eif=w)9+dH%bpEK*iY+DJ?mMX4GkYX
zD}6u9{%0-z3IA~C-*Vo&C^bKz+57+`=^clhV~u|JGkFZX&<BlY#ZhDpI=wZXd&mC!
z^nvJbo#Z5O<cLeS!Tg3|I7VVLx)wYe#**FjgTr`oBC^pFhJ?xF6imf*6i#?1%phlB
z4(6fQxO&kD^AhOg#@WkOvlEx-19DdvnH$h-?7ZC=x^THK{m=ey-6?Big=z3!8|lmF
zTMlX`Wk>_+gw{&myur8k?vrHPi=m`I{zqAs{GaKY4DoHq)cVs2%6W0D!WyhYF7mJq
z`Dhv;|Bn&>B=Mtdl=zXnEd8IC{$G;+g*hTj9a*0vZIX?{<bT)NDURLPi}(+zePmIA
zd`&+{Ka3KT$?N6vZp9t-=7WcQgX-vZtOoUn?cXM{4aaaCDSM&xwP}cS%-xAI*jgX=
z|ISF+|1NM}!WHyfGY_7;h8yU<Zw@>e)BkPqF76}zxio{ig~s;DKIn`7c(nd!AU*c^
zTQHbD6dhg424zA!;`nz>v-%lFdV4>pALRXHdOtYoT{V%Z|En$ceiwVc3%y@FT3@+&
z-jiXtct>J1`pYw8$?=$ol5y-8laza>ygyW2^!`vquSVRnrz`iFkS%O?n|~45V_|aC
zrJH2mj4+ix9X)-dQ*svOU|#fF^^Ey%*8jyi_yuK|GIIg{N9%tU(aYz%Hd6op$!4?f
zlC~a5W2ivMW_`dN`oCm(f%;Ef7Wc)iYS&M!H9t=rORx+punKFi4!LMjM>bcg|B8%%
z_g23TQokd$|4*j+{jB;AM^J}<`hUQ<Ct#j<wjm!o(b!x6H;wIq-a0_s*h}Asj<xF6
zmFn0`buA9^>pGYb4wK#Uv`4bMiwV+O4%-I53e+HPfj;9hdE%xqww>w$em#5b|G(qO
zkk?Dy8uZ)p&1v9Yyj}ZYqi3O)@6k5EG5T>-vmI2i9aL`Ce!xk7b=RK=o#Yub($A3>
z&_I7=|GUI}1^>+cr$1hb)H=l+?|+2%KgRo?=Ka5*ZR4D4xPhCvjk~yyaK-zdCVe9A
z?UY)7wpRK?QaVok|L?^Ai~Ucy3~}^A9~2(4&mUPt7NbA?&0#-DA4ryENxzxW?-1#i
ztm3XF-!-Q8&SGP0FZK(&_kSt8z4A-pt=R+YulQH#JLmo+{ey*1gaZ>Y!~Pe)680_q
zN_c<vAY*jTgo3%|J-ujen*|{Z7Jv8j=fhC4ur?7sJa0e5h9|?p?b1%##8A`wd2P$*
z!jT=%89VrTs81ViZQ*kvIkrzIJ@uVX^3iufxpit~UH^*x;@iroUk=sxo)1-N+2PP2
zw#yjr^3VIfTI?4-6%N~9@rRo~7m5p>^4-nLpW;7gz5goxRpgEPN&4mi{lk_?Uk+PO
ze>vnG%nGBO|JsPY;q|O9hW!0s3_G$0hc^a28{SkGo>La}g!J&%@`2&)X<rSy*MBwa
z$@Cwzxle|7r#%_QI(IzYd+~|z{ut}NW*QIl%oF)_Nskt3hK*@)eW`CI4QBJ7j49|b
z<}{Uz>;I;casTfb<Sfj=Jj8#0E+7|S36`O8zy4D}BCMddZr6Wm*GHhQL2Cc9nc8r3
zwdHnc|BuQF>-gs)58F^Y-#UW1Ss|ZZKF!$hdi`zsUQ`#@`)!v!;K+)ZS^v2GKgL{Y
zVH(KBQ`W!JQ|<p=%3$u+6YK*Yu@4}L4h$D&Hx8o&6{tZyno#nh@%fF`A2`1p6{s9x
z{ef##lQj#iKUi-40b4^I>X(}PkH#U^9}KYmK)B`!>=S6sbbbEqx0J2uK;c6DXxGU*
zo)(Ug$5FglI^lnkUcOx)n!A&J2Gu**4^W8;oa1)^mv9AFaSb<c6SooTzBp$i+BWZU
z-$&?D7St<$E7kwfdl~*w{hy`%FHLMQCtxePPcPy7pfCDkAO@pp^Jmuo>{q98x9{+L
zWJmkY+yCmkBOh!3K2Yv`qWybW`&ZlGsN=E!I8;2tF%qNEsIG6=qfbqbbME5a_v7gk
zk^2ALC^k66H9*PEj8G_#Wji(*#UJSpkW*1~R^9VJxuK4p&OHNF%h~PK$8qgXCA~yE
zr6`-E{f`QIC93vo|L+N5mUHG{9u{B`HkHaV-dir33Os+M_J5J~|8md2+VijX{A34r
zjpwbs<M~l{!t;Cf2G8Bt;Q93jy4Gle>sQBic6YXZw)mGJ**_z!AoD5`VGX%;4f}kb
z%&?B0iynLR^i0+NzNHT5-iCbaMCu>BUDid><DPIWOU=ok=N-%l9h2nqxyopLgxHqY
z7ya`y!oeuR#Bi7_K?Q11CjG@USkf#X)Jl5~*l)U|J@%o>J3qIpEoNt8<EbrTS7Prv
z%3V)3tY$l!AMH|Ep^0ps9qm$C?EYEWK3SptPF9Hje_iD|^=LvHip@oib-*!tT)S2_
z$~t@cNyL8wR1H!`^;1`=w>tTq!8u&OXYKcwxUb+UQu}YR7ha>^z)jpnNzu>$@7J#X
zeM+5rQQe9vdi6)z5!yj_g}aaNV{Jb4LLc;nwb-Hgyzh@zX*{;++V}YW`+a}1MqOUJ
z*!n;H{krw~TJ(nH#vjO|cch64=Glp7AO>S78s}<%%rtkJ-g-zK{*rM3`e<~NYJc!+
zC)*nA`@?TMdTwThiDWh=V+y7sH%H!&{mUUCOy}-8X)PhyZ9j%tWT80bkSY7$Wcx(Z
zi!U0#IBTyrdbxL8cAA}^z68;BA8q)RWVBT*<5$<NpFplcqdI*Jxeg8Vmf6|`^V32u
zcOD+u1>zp`?d$_3E45EihI0I~|KC((hi%Tw$4>0VUhKm`G%Yp0hL+y!^Jp8QY{*nD
zWU>E||LFfW!ad%fkA76Sa9A8As6e5<MGaXrPX7b-^d__+{u?0vBcMXNBktS%Xn)?G
zf!e<VykmJi+TxE1dlFmjW-B+%Euf#lIb1;ZvL{1^wA8i4+FR~_$^Lgmn5(#k8@P$v
zxQqK}TCM%H-g_onPs`KF@%GE|_$lqLkJx`2{ulN?-{t5_${uU`Lbv<6Z5}9@H!vsk
zBKx2(`ePslV<?7WBzmk>7)@?m@3~ioFqR(YxJ)Fo<8k#ya+bXpcBX~N+}-vNoI*~;
zbZoic{oJ-@et%9l?m5Qun}Jy<KA*$>mlNjD%kNp6eJdx-qc1@9AnmGt&xcB3DzJ!O
z-J$2h5^@<DE1wT5$W>^lwbp-{b?I2cy$(rw2ink%lG)FP(#6l4qwstvCo9&Ad#P(Q
zTmOz+=j34<^05=Uu^0Q$G~4?3`PRRat;@x`TD<G+*+6!1*POEc9Y-!&|BiZk0~&=n
zy7IfJv{TsSI<@K)6o2yFa8SHS{mr6Vab3RqLf%zlvHYv()u=!v>Mm!868@?A1;%SC
z=#AFJH{8h%HS|_%=UZy6C8sx`W3GFf=^lk?L&;3fkJR%o6fY|A`2SxAr77nf!*QHM
zC(hs;n$(5Oh;>XW+T{6E`=8!{ng^e0|En+SoAqzeNI$CWS@&^HxFDWOxPq%_?6OAQ
z+Wl+v_)p%JNr`ZSeiI$*+5XVJR{OD^eI@u69?A)K$@?fSQs7-xPNS$&+7Y(=C3|Xe
z_d%7o;y-9AX3NJp31dp~E@JtZJeD5foS(kJZDljwlAWPmX7l9kx@+w#ITXV&5~DE|
z<1rC&|DSB~(f&V^=~FNj|6lk2iTe*ucm52_!W_&)T>H0x{QtfG&*blhZf%L~Ue@t1
zdp;}@=MpT#3ar8!tV1r6%CERDNYB3K!#3{J{y$5-!-dKLGN0e0{eO1SAMgKjG9zrg
zk)dCt|9?yW|3%*bB}Mw5>ax`SH_hr4dL^pVLDk3;$8PM!J{-hhl%N7lE6x8$>i>S1
z^_frT|Ffwl$&Ls5|Kh4u#~s1{-2YbnJ5&9Oxc^O!c<N(ZXhY*D^{+ba7`=6+`gghZ
z2mK^E^u?3v=JvVj--X&A{NnyMXUKCX7WM*p2}O$^wq45AlUKN}qH2KWI;)*A!gCHW
zpE}FCQ-8$tcg^t|xQV<=>`mmBT=u)6)_2qIBYZ4vqZj(1FS=J{g#KjQ6K^0n7(+1}
zBQY9dF&-1qQ=bvC$;l|u=YKr^qEz`IZ&sp;UX3_^e~NHZF&#573v)0JP0Qtf{e%{M
z$X2w`+q0DY<K%zTT$KNDM7X-e^1r&Ofo#N4b<oE4bT)8f@5XYX4Lt5mzd*c6`L9tO
zyoA0Ct?J$u#Ie1$TDJdE<p8<DvErr1+u5P=^j`~iXixIna+f_NuGhH5R&ZTj*B-7Q
zV_6XQ1Y0A_I^-e`+mMfL?|LU0*Wv9Z<K94f$$dD8!ze)oYEXAen}Y1`+cc4FXuNKo
zL05V>MsGC_s0GLAC(*G-+XU@J+A1g+rTv34&mH&5si4QX#Z_~aQ_85^)80{U>BGC}
zbp9D6*RdJYn{z+d9)sRdvvMr8|NnLG=AL(iB)tPQj@RM{;$DgM{T_~OKHC5Pg7_}s
z3a;WBZr~<*&e;QsjQ#(+WcOivBlH;%?$g6xNQa1HpS{RF=!^b%^#8Ac^zsJJhQaiq
zc(iYEWszsa<N5E?^fTt_S1i@9KvjW$2F|(GaOaK0XpF^pOhh)C^g)^*=x30v`YEaT
z|N6^Gq~`x$*Dt{lVd`$_mz>uxxvZZ;9zD(PBYXJA_FIGU3zNke*9J7`<0SRTTE~bR
zd5hA+6vsLS@$YB+dxUF^vPOVkp=)i~CZF1)BF>$k&Tj^aiuB!=dM?c3o`b5{o=ab-
zV!7wk$FJLN&o%xFun0@gxY70bFQd2aas3^i4=d=a&~eZ8Jx9B1wqXswb;v~?y8C2>
zZDiL#_7k$FqEFaK?#5p1!$BNINhD3b4v+t@K(9m<;#vpeQIFGKh43}V%a*_G>)1qZ
z8shzXKP@A?|LgMqz1XgIoZf+&Y2N=#?;mxjpWxZV+ejXDUg36qLGf%cHXi@~ag1K<
zpCm>6kJHQdu>b6^M<Bfu)%VzctP`o+%>IKj{Ob10`%~qs$r+(>c6vC+?*dZ$hpf~u
zFc07o_Z1|^=@+1FraXw6UFHZCm>YCR8HD<?@-7+?|IyWSNB`oUJbYR_C!BZLd1R8i
zgS;xvYq)`%*t~8~xJ};0B5Ai|nEWl@#XYbd+b?hFKcE`-g$sWvFQFHDX4r3v?2G>B
z&I@56nQDKKkN4Zu)*i}T=-x+?srmQT_l>3(&r}{ZnEyyG*ZwJ6&3;0ki0TQ-7R0#+
zu?>{XZ!)G}DyCxwW}!!$C%G&mbmiJple@dW{vo*li?9UCumY<P=Xb6l8?^c2|DD&-
zbCHKQhrD&O^s!(1m>}L9>4PlIGzRV8DCO$KGWB6)v_T{O`zNjyAMV^(H$JjoZR5_z
zPV7eW3-*QI&-PEYuy>}`*Rg#*T3;9c{TKfc9RHDfBu%}pjZt4~?7Pw!I9a$$8zbNN
z#7Oy9`){u}i-jq2Y#+V+l>2(X&QCv#l>K3)`&{cj&$@Shbp`5kvIenTQBO7@_5UD!
z{nWmM+{cjmFX4;YDg4{n0FLuJiB6osIb6Uc^ps?VD`eOB%y5<L_D{!a<PF@!Z4?gi
z|GhB}+Yl-H9>3!A?q8eZKE3>w`@gP_cg=SgsZRgMd%f)byWBtj|Ht(QzD1RItI6aI
zwm;{l);|uFHtBuQ9|O^}-uR1e*plTNkZs;^dvE7Wvi|^Tw1;X}d-uZCk@a(ZgBQJf
z-}fk4vP=FbkVX&5AE-DhZC;i}&p#ZW8tfWFF&rJEt%oH?V=Ts_Nq%phFMsOKw=R}H
z_37J@L<dsiYdh3`o7I2pef6v5@AdLGd6aviII>YEpH3zl<ej{2%6fXE{82A|CPxmI
zzSOa+)vdLzp>FOVlg;M;Elq@}!cNBw%tG-H`EQUt2kEK#%Q@!s(ifmwSrYSpB^l>0
zm#72d9^_^1o}a9!^!)NgmHbg1^U3JJVUhEeU>R0m4Yn>ZKVa~X&?HYa@6i4tTji}b
z`s4B4wd_wbwZHIq|HoU}Ur7Cb{bO_ZFKAn+_cj)(|FBNnxyVD$ar-)uU2FsSWcNgC
z1k4ZE8U3WC-Q-@R=KFa!d6Nfg;~B%{cMylsvDh=9{k(g_qxC=~j#Z%0Kg3s&HK<1u
zij9dE75NuAy?lW2Zyci^M|Hz9p$e6m#=mirUnkDs96me#c7gj6uAo7Ax$-OSyWTs#
zpbf;mn!CjKf9ZhF%r6*ceu1(5Dsu{|hds+a@;}3`{U_(I=eEa6Xuin)$Hv`?wq5Lh
zNFufWwK!_Uc?5N+?`>`Y9{pc06~F)7^voI(yo#_fUZc%5&$!1?#s^0D-`TG8aA?4g
za4^ewRqiukvoPIl^yAX>d298x592d!|M|I%_6>-8{8wBtC(gTSDdP6LI~?!WVf{YZ
z`L&^NfHB`k*XtL;n@dN8;vDaP(lcR~@a3z$|HZ<@_rO+N{gU^ON`7&#g7+`?5AS_E
zAQYs1nT@$`_+aI;;lSEKVgJ2BVPD!;!`mN^32*iLrLb%HsIU|7-s>0Mxz#W1**x0X
zwK3)buz~MpL%a3$@am3lhP*xB44b<~hb=>XF>GD?i=o1G_qdPOX8%%nebg8H_sn{@
zL0=4S-0C0R9P*{qa}_xL*6c5bw|5N;yM5$6_r4O|xi>hxyWai`SJK1VpELe7+r9cu
z`=rxX`^fKT+Wu6iNXyWcP~L6Sm$Of3r#?fYWBbMV+vEwHL=eZuqF+w5(KDo{_x2c1
zoBa!kwq44*+W5Vdcj}ZTWol|YHuoWM)GKcrPAl(lRJohlkL6?K;eNJXa`UX;NiPy^
z>wWF1w5LKP{km(^lKH!SGwe8Q-px(zc|%<vVS9h!20ayC?-RmTG|&DcnuYHG@7!I-
z(!%f2o0dNnexE#tKfz^Ofn|reBgOL<^k#b3;b+3PxVvZBmtoj%hE991^*ClqZ0=8S
z2f9+Zow>#ntRMIX`dj^lSzrxgNQ6(hpK^ZFt*1g3`GNP;m)t(;bK#3*scW>2Ro5R^
z4ya4M#IMsjj;0e&g_bG$n!VjOzu$c#<o-E+iox7}Pd-aN^<>B$gMs*4{1Qs}KSx%R
z|Bh@>&iyKRbmcb_zd_crMc6Vew=Qc`Vh7oNJ%oeg-=h-EgT9sc?_}%T{)uOMh1~DC
zhivkfyvzTcoTMn+NUkJ*hx`^fRG1uc9{F#`|3bd#*soz4x8EM*{zq~x|GDJX+}r;_
zwq|`TafaU6|2v6ixjPn1EAm-8Tk*&4A$O_%-D=~nE7QWOmxm?tmOdFaZ}dMxY~{D@
zP+HhtnHF9%9{sw9$!CAqQIr<`H*vj5?(9knyFPg`yyZs|Z?kvo7G}@PC*-lT@UDLM
z-m__;K)w5(`uBb1@4lr^*gv&@;(#z8;Na^1i9^j#gb$}#i?ex5;_xtb$1&`X{D0V>
zem>MMQ984KBL1JKc9`dAn~*r2Jt5)tbIWwyD^MM~@QxkFpW??j<@nQRI;3sjTONg~
zBlq_m(=^S!oavkLYspkbotGZ2+k@tTy=P9TTR;ACqAg2Zo2K6Gvd0x_^y_QQ6>YTs
z^Qf@(ckDs)iFKhl+KwOjj~-b%!M>5F%`tJTf~-7|89wA+n3)+4y2hbN<`oRd3_s*P
zOco8W-;H~C-}%pzvvJ^ZX82%ShH>&l*nd7V{06`6IP9Lsel1P~SGUpM^zHr{KgCYp
zJmLF4fp?_;-Q3mk;;%dQI^M;HD2q7u57OBeo|eARnepw!>6)>L&Y|QLvM-ik&e+81
z{HTvhobEp^(RnV)?<6``3{9Lq^NmF3So(1OeX<j$mwY$TIq<uQ)8nJeNt{0R%Zbh@
z+4M<?&WXRAI9<ZMjlBEqMCUr!nM{r(dyyxHB~B0jdZM!elP4xR_l`)MKKWdtbMWxQ
z>Au4goq50F{9kdLe;fY_VeY#Am5GVdt0pGC;Titc`&lTBZ^P-K-*oP`5@q~GdtaT0
z-S0u+Cw@KgyBXXQpOb#X*LPCl^n&LTonB@#$5`MeqZ2=Jyj7W+;T<*Sd@b<@;xH7K
zTg2Zqh}<LOuS++YBmVasa#z!<+x^e3;1?5>MSa3=bC;j^S|Z)?<BtCVd4^;b%N<3Z
zkHWNpiG%#UBHkhRV1A!)U}hilL;8eoaKArySfZM&MLnAEP5eI2;O#MAPy7+N%(wn^
zyqVc2ys@%R*s-`zSj+wTi$fC&-2e6ozm)ht$zR1f{wv7e1XEXT@Lo2$Ztg+^`QJLC
zO(7#>@XzZj|M$1w*wtQPYqmY(MrVfYhcm-#VNK}ec>Zv0aI!!8qDfh@>-Z1CTT{Ou
z-k!cT4CdEq4%<67dxs_k+0J9V!*G6Yad+NThS&3Fa~v7N4VLe;3&uzNDqZ_yB0U@b
z2H!(Tey>n^^#@@gz5HCXcM9kH|Hj>^-u`v6HundiNgJez9i{WWc8h!e&3Ig!<hRIp
zjh21>kvY$vQej<azx_%`-d!I$?*B%}?fqP0W7cztO=-_1Ud?<ikvHhM#O5K-CARc?
zF0pmMbBS%Eo=a>W^IYP!Vb3LAAMsowf6{Y_9XZb>-Wc~>;>`)tqq=d|^{2zD@p_%^
zZBPiWME#PvK_8ocF~NPg-#4S5dyDt-cGSl@9_5)tc%S?r>f7uYYQ9F%Z2Q{He>%kH
zD=B?0QCj(2!k!zUP?#9s4`bLF`tJ>{Kia&3=r-0@XMRV$`5k}2-Pqgw55)h6Hz_Qe
zUi5Az>N|{jI{cC2NpnX!=h$2FV{<#MiND=kf>W`sSH2#WFLS?^IK57u+by4Gj7pro
zA<rxl?qlITQYW0ZKLg&awKt^as}^S+jv~H;R`QSW!O~wz{3+QH{Z(L{+Ke$B{CDo(
ziGCMD_&xH^qkh&nK$N$H>-=`&FQcv?@4WWy#9u|-J1AZ3S)!kB8*TPSU&_*#%L)zb
zInlP$G&_U=zaUKvGB;)v8{8fF+dQ52x$Ha>`WOq&3N2_x6LQ<13LCWrHVw!Kubz5J
z`zteSzHDC9`KQc(;+`e_Yg@c#zwOtvdWHOcy}}Orb-&T@jl`QDKNWT^?WJwm%RhB{
zg}0Y0udH9#Go)8|XR~wZo%+rNIlaPr+IsIhf1mUBJHK<PV|~prUHnuyq^<NJ*_8QY
zI9!|Id5!;)O<CsWZ{(J4$_nfeK3{v$eN>h@Hp2XX3BAlO>=kNn33E(ad`GWPKWb#6
zVc1jVw>}k)qG{OYLi31Tp^1H>Rhy_SYe=H~vUD--bD;ydo^_D-weOJsS1TKsj&nB;
z>K(Qe^!6X%-eFr;@34Kuh(tn}^;P^LUcfB;Dt1;r6?RR_H2*6zyuC0p?8YAYJBu^J
zyZti5-U;#fGsAo2`&rgh3@{f?-FJZeptpLlw0Ai4Ks|X!J(;FnR5un0S4{rUc_m~i
zcNtkeNWHqfx95-hGy50tgWjPgt#_zhZ-3?0>S^cKFYRsaq<UKTqt0(4o4ZCNTKKi1
z?d*s|droibb9!6H+B>{DHZvSr<3Gx^8^ZTJ&tZDeRBask-{AFi8R4=z_ls%jm#G<H
zFgX;%p$nQj8k(26M}o1O`~JB|*A2^UylQ;s=9A$s=zoQlE8hP;c9(7JE+zi!#INq7
z4WY0zBiy0?JzfnN;b-J#<0(Z$mB-@RHa#QsCO?laqk4>cJ8q81FgBl2PX=bdTuVpQ
z1NyLwpA0`Hhp1=%4Zerp!W;M_;cg&P*k_=Lk=y3kHeLQ!;?<A8mB@V{AKd#^BK6&U
z`@F8F^n;{(@B4LZ!*;x}xp$c2-25*2U`OxpR`a(KZ@=)3#Lj|mC3c<oTzJR6AMdXI
zMq>Agtgz?o@WgxE?_d08VlP>+_Q~*{o%?Ip_uzA3J-HKqi-Vr+FiOz9NMDbvfhl6<
z8p#L!)Eo3R9K&&(L?_PR94_DzuHY)JL6blCCT`;{?nAdBchkNNA%i@4XIA3S#aW3D
zug^*p-dP<Ef3n)%w=aj{JuinJw!dtC@XMie)OzD1>)B7&hl(NV_5IhI|GD1$wDs1P
ztPi!*Hn4AOU|-(AzO#XSXG3UQx*;4z)7%ZA87-qW$bTC`+prCx9mzo(LI>VbmrqhJ
z|1tTJGS0AyvdsJ}W#xyyTLr3-{tK!qw2xR5+8X90{>M0Z@i|MezR3@#a1mYj1b6TN
zy}za1fz{ZIJt#s0&Lem2D`DftSHdQ|y8o4sx8s$tx%QQ?1zWML>y@w_ui^CvuY~-M
zUkN)tc_q9-zIpkTu(Rluu&eTw@D}-Y`zv8Lxrh6m<r~7gV>X1n12%*L&-vc<4dH$4
zySG8#d_y=u|KQ^d;oy)B;n0W;;X@SWYzT+RBIgy8Kb*fIlq}v5O2t)1mV2)iWF>bM
zS?zeuhz-_Xc>iRb^Xth5@40dAZ-t}hSBIttYeVz()uH9y>d<;|ZD^yn^G}{#9Xc+q
zmJwbK8*^R`o2I=SUVZW9koUsNVe?Bbhb{M3g{|vf*8gGG;C^lH%l?D@{g5x*j{V;c
zZ`9hiXUO-%&Rzd0v1`=J;jN7?hqq6^Y`yr)VbArK<%5^QyJzPn_8xjU6fFK;;=Nlh
zhxcFlUSi+bm&5*r-wy{i{-?wT=U)y7^*;{n_+H|}3I91!*!$&hc-YIHU;5kry~Ga-
zzLzLz{$8SV^~<4b{%?hgzSm{+8!gRX8fIcP7Gga%q5%6*ga*tU6LJ@$2+jBiCvYAG
ztDg#Y$>G1K9K#IE!vZYArW@Z6uP*vQSVzyrmQ_CpTibpRD(Fo(hLgB}zeEqd`b+v2
z_y*ouu`Yaz{5Ez?Ul)Fb{P*}({5k#x<(=z7$<=kC^wPTUcig`<Hst;QZ{ova>%!aQ
z`zS=`<Y|f1VOpZ|{=d`j`1gr&en;^M9-yCm@dn;Q8IDw}3w4v%8Ovo)!SCYtQNL_m
zXvkP6Lp&Ka?te0DB3~^v7rt5h7h5#^w_@A%C&PBUc2EBo`Scxl!x;LT*r}nu3vcPa
zzKz{;)BN|se=z9pV((i0U%W?uANw5Jj|21%aIo22Z+v)4{}+cZK3uz0O#dND?ir&%
z8ND2F%~B<*=+&rcW?#jTX~wrvKbL(Kjr6#_^6~midMnx-Ye$mafj{&veuO{4Tfa==
zFYpumHGYb}$EWzhx0N~g4qnFhu?bu72Hrv=TJR(M3H}T}#^2&A<JCR*8vZS|;5C$?
z1;33S<B19ED)<lhzwjIQKI{RLy9KY|Llon;@w@m#{0KkBPw?0HDgGXx;=p&L1svGi
zC;SEZ6Z{mpZ0H-Y=~TM*U)-<4KYOr+z7^Z<rHAc!js80F9ovC7RJ?B*f7_`;wafon
z-ZIYkHg?bbyzl>c-(TApd)I#6_y4@_|9RivK3$IO#{v2WIM{3-F?@K+SR@W#H1`_C
z^dF++p7BwX(aTZcSS70H)u?GUzKJ8#jBlcTuJKJY(vPBPt?^N`&|A^wSUZyRj_6*S
z5!#X5o)J3sYgew$FrP2Oct%EOMg7W*(11o9MGb0k1a+torV>^3YLvW?5lT^pava{T
zJ-09;6yt|D$bD$D_TCQdJ#ycC?ZL&`gUhuC$-UeKFKREoq`gS)9-=+T{~gEQC3kY~
z%GBP>(%vNV$H|X)18-s*w&OLtjy!C}7HmZ>HewTA#n=CpG>z}zH}EnxU=z0B2pW;Z
zAL2*&DgGXx;%gH_?!U%=z<<Jj!E1N}2k_hYef$~z7M~(9NgBsf_!7Q?ui<(8Yy4aM
z7yMWJ8a7}Pw%`rCg?I2C4xkGE1HX$O;ZN{q_%VKhzs67TDW1?~_&mOVuOPQs{~w!9
zvHv0OqV+-8Lf?vQ_w@Df8vS+TJGKLF(BH()X7)e4#SZW`c8?2T51tYC*YHjJGIF$C
z7ORVvV#ILIvmg-)Vnf;dqK)=a*mU)Y@andVkazB@Ve{^<ge?P&S2$<e@n^#J&^NqR
zV!yeHejz`M3_I$-72at2W_Yu0l>bApO|P;Ko_=39yYQxE*3~<&Kszcqg}=7crX{YW
z-`&c;M;|@Tsos=l4w*h=w+3Rj#$ktJ-PURR_bBy~e<w=gaK9JNVZ;A}D7kyGjaeN}
z4}TQh=d5MXsD4H|_~RI!p5khH_|xca5`G@0r-wg};iuAX3q$|Q=pIjJ?~iM{|0=ri
zH|Va<2!BWZ1Ac~{Wf>veJ9`p6b238DjEwL(`j_w(w0n)+cQe9oljFFbfuWAv?tK}d
z*NBk&-%y2k-k*>eU#Dkhgi&N}?|+}zGWfSr?`o6xn4FOa&+{+nr{2o)jVruU4doq6
z)L9=T!t3iuV}l1qsY80PVX$8m%va};2WG1SqnkZ>7nz)z2p=>h!oL;Y{(8q>QU}&N
z3}3*V%;Q(e)|;OQ|5cd7xsGxF8vT%SH@SB8RZ`eZ^BiM)h}Yb-4nbLS*!lPPtzh#R
zoDmLerW><9BHl*rvt)(-->BXpwLhaV-c9-XfY&m@Cbpp8aE+#$&M_Wbu-qE0h2&b}
z#^R28yk2s$YdU^_;VzY4i++B*oqMq3{T(;1lAJDXymwhxd~o84(A|e$o8$K!Uu3-E
zw&S-PH$L+L4qPP93zs3x2=?(|;#@<|dFZ!kitzm7v-RwAPnp6Gcp}90q3^ABuiTrQ
z8;^J60KX5i#GCf;9=peji#zUF9rHxbP3{TK*(nW5_Xk`r&pUYk#FO5uIfd)^vEzTh
z9n<js8OBl1DktN)=bU?*E=;^$!R04Cr*nd9FL7R_IGUYDk744o$8TRy>YO>wKQ2sx
z@G&mwVbgBz9p-V;TSM^P&7m}&bYP`<6Wp6-vAt)=zv8Qt#|moM{#U4@J@3(p!W}ZE
zHQsUeQqG1I-97W-^~@RQE1Y|;?C-qI?rW)c;2z>Pjn5d%g5rbjai{Q(bxY525BzQ&
z&TbvaZk@(dd9uTvlw;j*=fiUf?Dg2=zI$fL|0^&YBZU>`fp%d%bL#sUKTn-s<Qn7o
zdH)6MVevck+!t246rVr7)1BUbT;Hp^ojSgk`@oPm9p1dT5#LO`yI79Jv=h@%mE%3f
z#38;-1I0bjHN1lY<?egRxq_G{SDC9U+y^7PC-D|s=U2}LrVM-Uw0pkbe0h7*h4k=1
zxQ)I|{Kw9---!1jocqb+w`rs~DSZ>+oBX$BJ7-vszC!puch5*|AJ^#_?Oktk{nf5N
zAcVJ+i+j#^?_;>nhVX{4$&=<UF0t3Aa&D932VM}*DDwgw+m!e4J0yF#F8327-M42u
z@QHVDM!YK@#vAil{4aBXd+6~k`wLesjc&>?w{ZI4P!k5*2h|!U`hof4U+Da{hvDx@
zPb-y$@!PE?+p`CUO(v>zeDrKs8_UT0ArHToHTK;%hotI>_54%c)%#0*KhM2A`o(W3
zJ;ZNxJ(j1J+`sg6Z=iUxhlF8Y(FdWY4N}L^M_~-cVFD(h=WrtAL^*s&c!8`v5kjW$
zb>xxD`u?Y^(fY{REZ1lx<K7NUXg$q_jdo$$&>?(M*xt^~!b@0)ei(qB{%g{E!qoJx
z3ooU2O)N=t&s&?`T|Xne(y>}Jpc$$Edm5M9bXons+_xt;uk`KH%!kv~+BSmya{~K+
zPG)#r8!VrlZHIQ)8}v7&r=1PT`gZj{`SwfpnVu~Tm=E&~`7Za~)zX9cF7J`=bMNE7
zf39@!fAjS&P;pgf<M+b}sMw&vBr3KTLPU*~B#;n8Dn_iRM6roV6kB|Y8Z9c(s95nW
z?Qj`jfZ=|Jff<IGVSwR&n;8ZgA=X&2#)=hNe8q^0ib-1WEn3q5?@Vpi`o8bKzP08z
z&%T{~&e{8%eSP*m@oQ-RuA%*lZUM9&LziWa^)R%lEwrtzNZTs<skFK6&~cJ+Zs>|Y
zcLlnk2YL^pKL`DwehYK(DEc)p3?ndl5d9o*fgAq)-L8M@;dw}dV{jZ!z%-nM8JLAR
zn1?eViYy=(VF}K{GBhpk7Q^apS;1UhrpPMN4sj5#O7hs1`mV$y6JWG&w>aS-=0r$>
zWH?CJ6y#w@#GC}la2Qe{4UR%OWI!flffjV2hjOR}wal$YrV`f~%xOpib`w~@2fH7X
zKd*jU>qm+|=2P#6Dgnqq*a$%o3>L<a5*Z(`BaeXtYUs<Ies!NRgn7Q|Z_0e#Tgs`b
z|5Z+{dqtVA8dm1Z49cn8R^`+M`AQJy3x?C91Ip<=MT&uAJ|Mo+A+^fs4?a+`KT;$W
z`vr`}xsK1uMMz$E%@shBYp#atArkh$O>ishhXXLLc~Ln-+ws(fCzNc>1u}2ESed_I
zJ?FnjId%D!$|1t+TEqEv{YW{zW0SHM^UbP#?Ip@>$S|&LE6_yK+ymj5BY<i}^ANlM
z*-!u_Pz4P@ai)0`q6ixeX_$Y;xneNK0>z_-qCrD3q<IPI;9;nMBS0~rc^fp$foCOC
zFDKIer9R5dKqmw8Fy})-(s`mY(Vwx<pYfb8#YjE%Z3*(P9RCG;02Tflq>^>pRcD!h
zhZ?Abx|7s_&=9>r8j(%R>l>gsle#{517$U16wnyM7zG$GH^FA^L2b{K)1E0id7e~;
z@s2|^epOH#%oqpg&T@^5=SdN=aOFJF_o1tWz0`i5lsLIJ+%-$w8!{Vz?Mdzt|9oWL
zth#rCx`!I?riXVFJebE|g0f&7`Y`vx0Dk>2gn1A~47}r@emk(ekFk+-#z>Hkjf{~%
z&m3h0sADTe+LUG;V=VZq--l}}XKV$#`hB=<<lD=n-y85Tya&Fx^&<Zcp_ng(X3QEG
z2C6j8ImiR5LQMyfW{0L2c`5P-;LkJtIb&h3F`kioK6RTP?-g7HJK+|%6V6=EH)N5?
z@EBCUsXLCWX++N7b!1H&l5g&;u^~Ia0k`dC?Bg!_dWrN~f5CA_c_&|@e_2c)w1w+x
zC&^u$Yl?ZAY0m!+<6Zy5bL~cNrJHN9h-QWNP{&xJ3Aq5V5tIvXH`hBuAMHHe@k07F
z4`UC;UV&`J?UFr|O&q7i?Hj^h9gRPB2Q*_J<DO68mjrt-Uk~NR^W;m84N}7pXF7h<
zq)#vRGXM6zHS-T#vF6lYwy)v)Uu)(!{B_Oz&3{~T>gLa2a?Jo?hT-&!pRYOnv%ju!
zVqQ?~x4ysTbY0k*`N5(!^EcM6IrXd7HK*z@?}(N816^xQ{oprps^B*Hl(3(x;rINv
zoc<tIzQoLSUi1SC*PIT?U!ys8xn#{<F4@eD<v{M@MWTf~<`47FUQYdYvFMhrkiv!U
zOVQ?FDUJ>nJ?4_sby7NWg_LQ(FXajBcOG}KR5E8<g{;P2gRITu8+^!m><!4q)bI1%
z+l$2j&CmkIV;74FTEPr$&hLo@toHATZJPc0m-+tSD(nA|o!Gl(S^p2+Cs_Xvy-wEu
zLw_pk|6wqN_5Uy&&HOKn2D1JiT(krKZ~F0Hkrw|8-@qyeuWJa*`|`YF|07(4eHYvb
zzkuIB4e)wtzJ`kldkN2Y>;;t1un%+eRg9;A8D0hF)y&zztMEF!3EQuwEQA_pfC=~=
zoPuRofmLuV{!up4{@MX|LJaZkhx_4CcmZ;t8f-8Eufsd=5qt^X!rCz2Pk0($fU7_g
z#P}a%Z)N-sa-$jlgFMXnP=KEf3R4;XgW?Rv|DXhODU{(?4i&VQD#L02MA81CEU%$V
zuSM3S(N|0VPX7tngxwHDKaGA~3(`1^p56@lPn5G}WZMeuAFwXc{(<&c$~ekRbq#ta
zbVZ{(4c_l~#85WIQAQ^GYj4qkWXeYD|M|X1CS_|p?H};2|I0)_`Y`%}T!XrYs)qZw
zZ{!*C;o0ID?EnYQX&3aESm%JEn<@Q!Vodp-n3ujMtvr*Kh3`ol*m$<ANf(HdaHDim
z+(;MUJ;*U<fJ6}YIQBm5y|V$VZ3vKl>_f;w{6~<(*!K|b4)`hb!x)r<3EqUPr2q=E
z0LfWpj}p*g&V_v9%-ei9?^b}+K_heg4b0~oG6K-C43L)80Q!>wQm$j~j;sKwtPfz0
zhB-;>Rmj@R0I5NKz;hA8dGzo)ybl-fY=*-va0dG~uoiO=M8Mtf0K5SE;W0P@8YqS;
zFvI)M1pQzG54;T@g3mRywcz{k6R3bsR5Q<aGNiy^I78TNxZebKz!ta{9)VxOGf)FB
z!wIm$A7Ks#;d0)e3*mFjVaUHC{{cHO-wgZVK6nV8h2KCPG=l~D;rB2DpMVe7bP)u>
z7T5(pf_q_-vfyK1>IOgNgJAu6lml=8c;qy>&<LjpM^{s`k+9dokKk@df#)F~8bFBS
zVz?5*;YN7m0^VCVMm$WnYl8jN@?yR0L|%t`I7Fyn@cRpO&ucqq$5Yq1k#W>d*AVXX
z7whFb<X?ES{y^9q^%%n61^*!43$a(hb%gPU4+!@K{24dYFf<zpmxoM5{*&-lWcgL(
z6=6Qa{TgV32Vf(|e}(%3<_*vSpTcEu0$Sk<_&&T2R`_371yltzg~%l28bED96ASUM
z8!m^x6Zcny3CFw^KEnPM_7X_um>sxX44>jYgIhWL9Cruu5#*^mE}`uVe_t=BM&S$0
zMtBNdyhKj_Hh?h$cpmcsXotT5X?{9ny)YG~;qyG2i--@^7R|euQPa|F#(X}0k0aM2
zUqJ3c-T)n#hXGX<`W@>y-*-G8$e0P{3*aO8YMq=OgAd?!*aPd}neVuZFu7dwr?3Mv
zey4tp`DSQ?L9X#tP-Pwb3VsH6!3Owr9c9anlEqr}?A0432bqgqyD>)cf@35fS-|(x
zbx|?&e`BNwS&UtedkOxf$g;E;DNm1)ip&_PL{?$1#=QprT4WvJ>!V_5TgONvvI)BZ
z_h$TCkjAALF)hbP>uQXck!{#5xLfhJA=?RWk76A;$9E#Tusd+?#=i&Io5uIu)A_!8
zCf|2Q4q_j|eK;^iMqo4|Mx5XRH;e`H-FX;?3DEF;_bl!`n|sd**eAIg_lY)mpX4F)
zu@^+`6J5+cDMS`wFUDPue+jY_ds+HEDQ7Q%3S=erD%`7s_DPNZKB<Me=zUTT4VW9D
z2|oigV{QTC%04kcD`qpa;b#FWW*f9e?-M(8VD5x2{2b7Yxd(bP*xvy9F%Q5XenT*f
zc?3qI*&hL1nB6djp9jV<Pk?5H`-klK7|DU$q!`gc9_D-~$RPir5OWa}<EMub%%xBk
zP5y(o{!d`fw#4%o|DgUy)?%+qr~aqjRmW@_u{Ysvz`q&Ug55Yo{qLdvN1CUp|7Tb`
zHFut<Yp874+m~4%y-NL$?8M%My957jWDoY<A;v~I-n+g$f%Q#^jEyFv?~fe8KAKJ+
zD-#`Pq#OGf?jHQdkrUXv$UBFH^mLG($ad`Z!=!f_>5Xh7uPt%pM*{hQG-5aTkzeGs
z898PkJuRds$4?+fu{*V-x1RJy4$hGuxDVq$g6zfKcbxp1Ccluyhxdv;ZLgFdOEdRM
zLEK*SE%r(wvIu)F?pplwkonj(etQLdP02>)U~fv^BZdW@rzM^zWF7YUxIN4t?vX}h
zm2(gChkKYm+{66g9x3Cva*nS+R{BKCZz+S`h4<kMyg>OBN_okUw1#f7=03~@<nO@)
zQ<QCgp>Fzb>Y_KmANzK=4IY7)pd9{0{dNZY2^$Jga0mPpjzAVv!8d>!oaS=426jUn
z+zt1`&mk46p&637#zPlTC%|5kqw7NInD22M=9{32@l(cngn2<}fm3T~$Kd`F{jRgr
zfvv~~nae*;pU+7@tYJr#d=XAP3ZKHC(7$NOjgq(MuNiLLDi22yhI#UL;6jeQlK$Td
z)T0V<bi-|gdl<jlaDSG*;3DzriMNn2Z{c5q-^;jHQ5V02JrDa8#PdC_Wt_MNHCM<`
zZjcNYd@LiyA9=@e`>Oa}s9Y`sP2Wdf_Y3i~on~I=OPOd}l2Q3UoNOlIVq+XPn>Drl
zR>UG}#ri7q<t&2ukoeE%9{f1JgF%`u<_;Ynt(WdkFOi;~2TJ>y3&s8ef9d%9rPBG2
z4RQt7bS3w29oN5={^>CY=lY-H`m%}R4gCL#e+ltg;We)POTxBtPvfwSe(@0)BVVrO
zer_YoOZeYSI1}l)g=={qf94@I3i0g5+<@Q5gfZa%jG8W-XB+2v7%nB;O<+~8hwEr!
z{JI1CG}m<k8aVzs><@9CZo)i4To-e#4<b*J_O}w>e{;@A&eabI#BmeHM04za;oe7>
zFF9Wk$6P`@s={X$WJhn8oK^Zi3-o`M=>H(|Gq*`W>^9MDR-`b4^-p1ZTadMcdSnT1
zrNPW$97g|{apMYPWhVOF$ZG60$XfhS=Vy#boy$VcnPZ!f2JFqq7W|D1=szv7=ONO(
zivBZF{dSKPX-h!=Ii2;1ndrYVhSLfE@BJbEv%drVTkrR!GD79w>wia<(0`d>ec~MJ
z6OsSff0i+!apIZ)6X~NTAI_0hkCCP~k&Z6X=VRDJ8eNIoK4drkexzX*9EU^rpCetr
zCJpy<%!A}b6mc!WWu&u>V}4D3-a?of{9fW&c>_L%Mf|KB|0TzrBEKdG|7Um-_ahwR
zf+sj%3CBhfe+|4(*dtuadBpW=SmYWua;{bAApF<(ts#!*;0EI9C;s<1{%YdcNZ2Iq
zv7K}M2LD@ddkBuG`G1i4e;?+Wk{SO`Wd1*a@&9z@|1+8YM;2h$;a+I?p%f)C{~yo%
zKa`M$rBHT)F+!-AW&9tiFjqs(3S)#&hq)da@N0x7%m!$7qW=WOc=Di{vAH<L=(vV9
zq=j=@kv6WO9cjnjkwG6dmiz|?bVCpHGH=ia{V)K7Fa*Oe0_wLpoS=T2!wqBLfpM5P
z!P*JZ;9=79L-;FfBY(n4uWZupZtSl?EopX^_vRq(caVm^fLhY{71I3;((xtI{tCj)
z;?{}`AWy<b{|LB*F#F+d!gauP#Cs9v+{<}BC*FR-#}aQbaYhpEL-P42xVJzw$Gh-9
zi~lTfzDSstImbraABL}CKXH!3gT(ob`h0NwI~<ct_`8X34*44L2w@8GAIGnQbHAnD
zC&%|iZ;{?W(rzPZ2t$lt48jNugL5Nk2ySqJ`fbB8P`_<B4kr55#&Y)2vj?zFlzkJN
z0qC#?hz+|HX~(}Eckg$Ik&X<?<fF{pq%uB>EDxf89!UQ@oc=koI)?r^)MBrJ`e6F^
z(3nR59=!egEOe(*|JA?GB7RjdHiz?N6Tg=8=k_r^fISbX!@mIcBElCU^%?9-aWsIj
zn*i3$1ju!y%`2qcE9A>(a0ThqirZtPV*u&CjWk<EF2LVN|MjH**M!@J`8>icVE>G;
z9i;2U<XH;gqIlPIFoWMo?4iW@G3@8K$KVP0HGIP{Um`0w$06dl5c_Vpo4mOP4uK1{
z;r0qd5LY6kLKyiP0Y@Q>bHzeDXrUaQ=lY7co&j(}`QO*cACbS|ntEUk9wh%YTY3K>
zJDT?&axrTmFNyab3TCerT^jHI)-Wkb;r$0aT*<xfg|Z;@HINk>DN~W}Lv<AIe+=(G
zvMzx4AHRlR-hX5hcEd97|J=254{2ib^N~o>pw-7m4k7OW3vpWgedIyR55Wu2L42LW
z*A@P+G5&7i>mj~guB|T}{l`qof8-$cA>4=YA3=^1-Wf&t&+%^L*z7j(aDU^RV**?m
zSBe`(r`g8_IbMAwdT^{wjztFn+0Po@fw(Kt^X5C|tVJG9=DX$C2a!Xp>HYWj#oO_-
z2V5yHajnhp@9&GZvVXc6j9>yQ*jA|LkQQiQEp#L2tV7mouasupmC^!D*bUe#{8&G{
z%sMK6-W7ec)bw3Rzmk3ibL7=NKafHX<C3wAQ)(HfG%zlSxip?}$`jF|XTCFgBkPT+
zXElC`WFhmWsn`3c=do*#^WLwZFT;R%AF_9zH4`Y54+l~X1W^uPcSrHwgENeB0Mu`e
zkAwOR@-Z+oXxs|w8V)1a46NM%3#ht(oebD_^s$B`>>5$OrP))@+KqBQaUi=)=<xZV
z%g4FP)1#$q867|DH9_e71)}?hY-F8AL$sgNN1z*sYzanJ5Y#m!O^~Bw-=-`-$wI0F
z`}qt$<$;z4b8gnvQiQn>bVJN#Ln-DG(BoeWZ;)4edH-*LL-1>O0~FrLC*Ts^%j@7F
zco?kE0SfQqdvGQ1=WfX1-75Qx_nc#$)D<4y(M<I{g~Lz|n|W7bAdvTSGsHkWd<17;
zE${3P;CsZi9)1juKqYiSBlN&&xSjX+Uic~GfIshYFg(QjcL?&}Qts_TRq~!a48Mj}
zcm-aA&p@N+{jcWzH}L)=bFpiOc>g`T|HuODx?{moSdac6vRKRek6Q_{bd~xaSsoND
z73Iu7Vy+^dYF(t%Bm_xq%=e{^Ik$Rf2>*dJ5?@oy55!>S{U^Q_FfIhM{)zhkF!g^X
z^*_>r-Fh}yY?L$YCdR+v|MC4h;^}r!hgcZ@P6?7e#>o3;dH-p*4leWT(vBTQj$j`>
z&ij9oXE-I8b-p~ag!9B)CgaC>mQVV~H>ANuq|=4!a{$BeF<ePL-VKpF0}sL*a4Gp7
z3b(**a1R`TkKt0Dh5hgZ48uh{6W2i@T*Gq_3irYMR`QDHBLLpPzUFh<>u@Q3w5#EE
z`Vsd)BJ78|;ePs{XD=cz;S=~8_#}>|pX<EyV)7EQU^_erkHgbofR|wuJkSAsaE@cw
zATIz-EcqWnIgvpA$C3ZZ<UcYGd;Vt1mE=vLBVWApA4e(w)0zKBB>$7>uP2fJl%?fR
z5l#CKsxVhW4e9J%|Gh~51JeweP`C~v;YNss+u#5sLNYu7&%!U^SHNezG&-n+IxvC-
zj>8Rb1w0HT5CC(Rk{_@Lr{NTQ0UyBn%SdNf#{5^H3#iecmzD+JV$NY+J$H!xQ3;dR
zhrTcrl%r1!g-`^=XPJM85-5eTa`gYJ(f?<Vpptm1w9LOVa8LuanCqO(zcX;qK)j88
z%)fIE1A1|)K3ogO7*oy{)8X^ko1Jd~r?UPpo%Lr&&lfA$Ft^9EKX3wj@W-(~a4c)S
zl3Dwe#Qwmz_u$?e!T!Ec?C%@H`m<>E{t07$-_7hV2qTzBgV3!DW^ba6tUn86{aFC(
z&-~HFJBtnx^@_%y{14#$4<`SE$p2XpE&4@yGwer%4p#x;bc8Eh71lqnzv3eMcu@D0
zEU>0^o_fi^`rmqVzZ~fQ+R^_VqF(9~>M4<0%yl|+(DdkWX+;{d*#Cm?2EsSv-h#U^
zUBq;hwN9C=Z_N;COFf`i(hexr6xO+-_tc)mn$KkVz=<NA2ia#Nf$!tQ^L?Bc-v4Oc
z|E&j<zVHJI+ka@zhjZj(0P;czg4v(lqijNkK`2}cKZISd4a_FaZQ&Zqxt40q$u(QK
zc8iB=adJ+sv4d;1bM5WiLl1OgcHq{9-@qz*TPy6beHNXzW%Rb{(c3bh#|49!-467&
z?C5cEj#2RLM?<-$etUc@l|4PVhWp`Wpjgt}0e8b*n7#g>awqaW$l$)_dA=joQMN!R
z1Vd%4$Ti68VLj9)k@lojHL@z1G(Sq(r;-PU$qQ(K<`nWE;$EdGjcZ~c$_XxT!x$Xo
zeB*jw`Iz(d&e1kV@Rj~J^!<}r4}cuPK8*Yv|8^5~`%<iQIH>C_)af4Tc4Rm99;DGu
z+n_v_xx<@9{k}pQ{QLU~Hq8Gdj?2032Ot@kR@WF3{p2^umL%$GWF<7K((k+V0LYpE
z+6u@~+<y<JVRIm5C~SimxC=B#X&XQ`<UsC0+6Iu9>?a5DPl8{88G7Ik@HWiC`b&rp
zB4IyF!4>P7JA*xNCor9$IRei?1GK^jybq@!W&`yF+z$7`f5D^h6ubx(Pzx>426pI%
z0T_kf!<+C0oVSs=O1K_wg?KmwzXbFzHG}Y5I1bbBC0xLBa~bS{Xo!W|fX~Bg9s-Sn
z_n-PS+r#^h%*C#qq5Pkt{6`inQU5Pf{;yL1Ba5-?aWBEY6j?_2@*&E9j;};kVXsa^
z|1TN+KV)4R`hV0J4VmcwA)BxpaBs%H1!;89ha5ux&qG@h*@oRR!~Ay;<9XEU?P=&g
zY-at-Jo<mowUzZR3-lfF?}6TG`u_&%e;?MrlvDo)v;M`8^)F89e;AEs{fl00!?OM*
zko7KEyn8U7LHi$aNrN(If=+l9rr}HS|DW(H%)IWJBA|HCv;eQOW&lvh(0l{FJbUTz
z66C@Hd<K_s{7%>dx4=R2BY^lWf$P8oRv3pf#N*E~I`{!$|HQMlkMP^^djuxocHAGu
zy#@LqkY@rFF3ncZ^Q=|CNq7fVLBTzha7SPS#^7b*_#M<hBecbk-f^UN0{R9>BX(05
z`4LWjAiFZrx5T|0{~lyJcKc!4V`=0Ea**&t$7yd(litW)?0sS62geT}$5!bp;QsIZ
z3P-U!7s!t##uSjberrV=uvYSr`N3<sk+qU#Un|)`YbEC(eRcMz&_fB7Lc!LxqT9Gu
z3JF_;tcqDH)x=-J`D>A7K5PHA9t&9sO`Okwdo%tm$U5xx3p|gUzY(cP^_48hKI%*P
z;48TXU(srPC6DqiA6bARx(;(8<|6!xk$OA2Z7aS~3S~N9bPRlH|N2TLRQ35%|M@ch
z#r%)MSL&*LrJl*^hE(Q%pecj-A81DZt%Wtv#sKDjg3+xAV*V!%-3mYEe-hCDiDLd|
zh&2v@%>O_KbVjk>D%@8bKECWt=qo+J%>PU?|C7f2PdfYC5!WDn!y(*<kt5hgiN_hk
z{14KNeGGRG{^Q6A?3!iTKdb2JBXg*~a+j$8>Hp<HJ`_aIuY^J<f?_T0pN;f0gXw1?
z%hJ?-DeWKJD-GYZf3VkJul3OWLDpk$KsMsvgfw7pMz(0Fb2F)Xvv}UX3~gWmE7+i&
zc8guh^QPx{Lv|T>9!)%t&;z~cw11$V_R@fs_5ag;GPKP4|5f_3CwTu?c;3K8UE>Dt
z_}}q0@_r!YGvqdXD_8Tb{18&$F?fOZBpa%s0pfruZ_WMq-O2lD1`kX@Svu_>?xBKu
zwi2p*{z3V7j`Hsu<=;8VzjIQbdQKWL&e4}WCry|Q(42HmTEJ*ICnm=^+92nsAJ0)g
zo)ZgZD{eMqd;K}a(9TK6;yLMDIwxJr=frXL9DDekWB<-`(yKowebwirpD+V3s5>Wo
zU!L;soQ!zR$*A@m<=;8>=Q}5E!#Np4da};RI81<bmTv+1(e6BU0lJa&<=Icd3`Y9X
zrlAX@+lO{0bkWXqKt~Mi&@kGiNP86R%uL#u9Pj<EwVDqn*oWaT?Naii54_*O=J+v=
z^?-|ZsT)Q)-ibYrHfuiQhO<r&nH|7$NW7Xro=f5_p`BX_#R)ujGxX(;p@$9ym~}^a
z&XBd-Lsf#GRMS?epl+|6Wu4#(V+!PV^U({WC6j$OQrLF`8n8EJFqUxq0_r8cy-gl|
z3kuId7Pw&$en@^_4x8W*`~;HW1Z?6t$VdJ+v_U7t;C29Bgx^3O*kBAM0Tl(!*Kiu{
zqFl~|T=*Vkd=Oj$Y$>M+Lq@=_AO|Ypr;rLQV1<X_PU?eu;BT0}fm!$fw&3;y=!GNP
zS08l1`|u}N$G!PODLe!k>cT9@h8)N>Q2%SG|8>;=P(Yooqi!!m7U`+~tEvA9Q-UnT
zUO*f-aQ_vn)c?!W|4Y>W_|*`m7FlPf{|60D#{ZxRvw=23vxWXYVT^|v|4;kB^#5_U
z&C%vwpzUqp{de&GBfC7j|H%LJ|L3T07N~D1&j*l$#6Og%?l1XW|9^(}pD-?@n>fa%
zzZ?H2o{6C~@+Q|_0=JSc|0MnQkcQQyoeoC8iM<GSAIz^2W(xCLxH;hOm_2ZqG-!us
zaC?(9egL<Jkv9?VC%CQWxTCn;f?G5c;x>=p*BsN0-;>1s8TPaA6mGsqD^*j#7G48D
zeOB{3+8aa2PV7D4+>9Mg@v8i{KhGS*;%<W|U|3%h44WYgwn8{Wz@Ithf8Zo>Y7)r*
zgXBNtV9rfo{%<qoe;oOr=_7w4OzRr%59T=LkJx9x!?k1}&*#1<{xtbe4r{SDA}>c;
zkXIplkUNl0WEAobv~?0mhdOA4A9Bx!zzDCyr|>!a4FX7ub?_V<gB@ymK`pdEFV9gQ
z&r?5A{SM3^atQk{awM7Y-@}amq%qEmRKEp2hWs<f?d3XBLF4C?ta)_PAqR4yoqHRA
z{6%!zXVJf3>660cK6KpsX#e(!ep#*m(Q#izHy;0TsKCF8_^NajO7&UvKL}F`b@<iK
z^h?8BzckWrXmV2jLo?=<Ec8E2^#7rCbHA7^l*81C7W$Re*a4X#zIN(Ldo|^*fpQml
zH_y&%u$y$diRY$wm2y@~xl6cTU>~ff+(Zr&b_BQ4A<9_~<u1~VeGGRGag8G<uxrxM
z(XTct+4V*x2XeJWMe8vtd3{DDA9I1zsOZX#N}<816yaZtn|J(=vZ6GO`;X@SX<t-C
zutor?!qI=3;r_Yr+Bxo@^sI-5CE7ovXA@~+Al;ftqZZ6Yf7(CLO1sDGgZ@h}?H|lm
z+-$T@+KJN+9h^h`jzQN}^k1Mm4E-0v_NLJOf&MhwKdH2T(rN!>(*8M0`v>!AJnbKF
zajouH+CRy(e=v{Zrs}_F`sg1*HvQHdC;dYU{X_b$dDI>Gm<#HuXP~f}dZ(N*V?ATX
z)ITM(GfJuJ%7&<CIJSZ~Dlu2l->rrkj;$q*Is;?ZTfdwCX3o4RhWYPB=D(TWZkcEP
zn=yG)6!YKF%zxw8Mi>iN!<ql)xOT?x?Lo|cBRd1x7bk$VjTx*3OlNKE#xv5JdPe$i
z?<bxC!VNO6Pq`q&;fzIu(Z3I3>|vhge~IURf#-k4N8aVRnuCeuJjKR5*oT-ufj{tm
z&cXoi`aF+DCS}UEp}doCYn9hO&?^7LoVAsA_$ula(8CyM<AHJRd7^#{U0^?Pf=fqx
z3kFGpp_x7zp6inl!u0m_OCM?1f1Gv#>DEFVQ!sfgWfGi)f~!TN<vgjd6~AzpeblJ%
z9Rkhl0iIDLTMuapAs@b=96ZgneTA$$OnL|Q%ReyB?l3A<(d9~Y0{Ia~ek7A$spJ<_
zB#~dBSsjzCnI<Ls@R;NrACuhjG0_^vB+oJ?`T8*_a8xR~g)!#8%9Ns^F)0o#Q}oqi
zQWD-PrN^6;vc)keKlZ9r#Fi<QXAMf#%$QUYzGi+*Y7@$oy2E8keP)@`;2h)qzmWbf
z`v0B_<!@YX3j>};2m1eE?El3cU}p9JYYVuL_8+>{es42}if(p2{a@(V%KSHUp?B?o
zZp=N<n}q(q;iU9epQQc!jtusllpz?NeOpFgbn$KWzj<3+D{qS%#xQ$e-1)XlfF^!M
zvLHKYMsgq*vo`Zh$xDAz^3&dwg2XpPm;9y_rq4)`V@8UHW<>9qkrHHS?3+?{>`m_f
zO{suNsDf&!fm*18`sp{N;rN@<IQOPB&Cl@sv;V~`bGTrf`7`SuW~Fs@R?NsY)__^2
zXT^%N1<gu(%B<L-<HRicU(B-pVV3m|v+T1uD?Nno^`Di#jk7%evohd#kN*EG_x~RA
z|9_E@xj7kKm=ovnoVZr!sQ>3=jF10$EOY3q&B+;_k?gbYNe=uk`SH=GwA%vTrQh=|
z^}u_gg~F}xN)hy+Q);>PJt@Il3dNv@ij;Sy5*9i3OQ<>guGFTzD^=<5q6hP?G$g;v
zcQ4<Szu|X=1!8?W{zuBsP!4cx|C{uCqM18C*sK`C(6z^GO=SL>MPqH)Ey;X)fI06@
zANqkQ%%vZs-9sg6qtDTW-EmxzzY}l%>uco9eeA1uz$NAH`by(hf0pvg(Yx98XE8kQ
zE91e8`~94@FzeroSum#W@|9JN^DGxC?VBgW9zG!*&>1s9`8pwv#0lv>IKlYxgcyS^
z5EFx;t%Ng23`iScEMXI34Hys`a&&P(oYTK&zmNfOJN;#hwzh{&b;o1ZGLOpsY|#_c
zPagC$Czzl4J^GmwjL%HS@X`SNh6(zY9{QIaDMTT;D9j_pC|v73?5`I#E~UD)qAXJ8
z1o%s~MI8@dU$U$VL<@QK>`&=fD|IOzsfUI%f997BN)r<UhJZ2lDqwxhQGY4hI4<RZ
zElNdHi&7cTqEz{hOEuIS^++vbkNQe2^{#V@G<*PErn}H(Vu4pKY1A9TUYDHze4wb3
zK;#+xU4iuZt|A>jqTk<zt`m%o(#|A5YgjbKzQA&+8umrvBhPDCOxDBk-^VQkwn8NA
z1MT1WHY74%L!0ZjiX>rv44#7;a7VKzI<f_HFhO3b`RRc%%K7Tw220h{CaHauJ*Ux`
zD-U5z{%zXg?=p8R%(Z=Ri8P@D*FvMJ8NK)VGeJ`K)%T_G@0Uq~Y@mNcI~+7zUl;LD
zUr3u`Eq#Lk#+u*>cmYtG(EJL11KCgxwa@?t2;u$t9r7Lc2mBK*VxEa1Hw{B_n#=Kz
zL^2Gg`4jDvb&N5;ha8pb#aX;vTm{?2MZJ^PbeX*VBIR&hxM+WNnT+7}8Wae62-k0r
zC0}%9D7OCxuAut98SaPYp$PVHD>uT;a4XyacR?F%zXQnDbmkdp0~%?+m|*OvOFZSn
zGLD?E49n=*QE{fb#8v->xRHY?E*VO5$#8~?_Jm7%gIv_NE}mzX4EVYD2CPdQM~8V|
zT+&0h_LHN`SGbt38D_r1C2dD*6iZ5tVny0KF4p@rDyFk8X+7Z*^KlpBK@Ex_+{L~C
zZ!jKGqtp>^{ZNh45bcsi-7x#9xa9Afc@|P#j3?D7Wn4#jl#BM3i}5WN`dDv>eqmTj
zf*O=klS>Lf7dtG4$fERN$@LkfKj)IX)HfK<a?###Nfy_UZFfnI1$!3l-vGB{Zyb>v
zA2;FMqTM_qdC2@2w-orhMTfaC$jx}0TZ)gmMW5-Gl2o^p9(GHa$tmSwZmBr&2dP9>
zZFW=sk4O!&c6mhV7XKjiaU=Bq8kI)kX>z*J|8Ud(b4$xHw-^K6wEwD<)_Av=^|eYH
z=d%peD%NN>Ycxlg|7le0Twh1{AL#$pDqR5$iX*~}{<oWV$}PPoYn8rZwMsv7AlNN~
z2c0rB=a%8)BQkQ*O@6wW{~Zw*(mm~F{?{!Y+{aJ2Swqi)$oe+P3M){u?QP6|waZU=
zZnd5^$um&@Z7fg<3|d8JYGeMpO^T}9q<C|IqK_z0N|2>XZPb5ODNksVio|xQjBS%D
zs7`5@8f0x|o7Ba(Nj>I<q&8_xY?G#lHZjDsN%Ph=X$fl+V~SNw3vJTs|E8FcZS!qn
z@o5(;(iYV&?OWeu{Hq=Pw_K(3SetZZwDEn0Ht8mw9-lVWzqd(We38=c-zEdo1<D}j
z8(J<<h6y)vqMi0%uHxkST-4?6g#u+PL92L<w$c7=lZm)CUQnHq<)c%wkvVe~+CMhY
z23v$}E;RXp7RqOf=mPDG|5@Y#+<pdn+)Ml|QtD@6yw!p}wM8m|EK<4A!uY36eomO0
z*^}u1Sfq}y^=IwUfNVT&m!_4I@+khzcCFHan=#&kKDk9&iN_pakv60ys8F#cS<ofX
zDech~?q8>LaK28bUAp46ii7LxjyWkkLpr55Ggs-0wMc)41^pij`hPlQ=%h{=Mvib@
zqbW9VE?MLkoYOsTkumTbv&%U0DcnZVtTH^4rwncFkU{NRGT`i>FJYBFeTVcSCpH%<
z;|o^W3_68vKQwN=Rh~yWomLrLw#ti`t?5>D7Oe6s%o=D-u<{;JW<ig?Rk|16l48tV
z`Zn5dR_O?{(x<VKc9fyfR;h`yN_BvhcRgRJtS?k50^gEy)&Fpw(&Dh9E7`%gB6Aes
zJVQJ$4UR&k-czX5A@eh>lE?Y9@to7oDmfmjWc%eQSt}iMvfq-@<qj!{vx@#ip;F9s
z7M(3r3OS!HU8fYJS*eU{lC{|;+0iz}-E5M3&?a@9D=*&0_Y-LQ-=R~c3v^1M{f|-<
zXOrSso9IoQ=o#6hw7L`h&p%4s7RG<-xpv~Is<+Yqvq{ZKo7Bz|*I66${{>2e-6oA`
zMM~46jrpG<rFqtd4qvAjGx8NvxQ+D$e-yK}NNEetQ!Iftv2tCuFdJ*Tixj)5Na;X!
zX4)8sw1~rFlWt3=^z_+y|82~@c1k~TV96$fD>fNAYm;HlGqPZlQ4{&^u!+lQ6Zepf
z^)KXqwT=0oKgtABQ*K0es8q>5VPySFv68DZiZ-fP$#WVd-@l9gzedsNjkHhRrhQVZ
z6zj7TeV<WE2v@2#O4-3;r98G+sX$g5j8b*7RjQMX^f$Vs_MlPfjvA%@U>D<#Mrlki
zN>h?i47fMP8>J=AD8^$(F-;q3{~E<SW2C-l70a@b`7hf4$o7*)o|P`?Kz90d(f)s1
z9MxUYovKlKaO?FqN+0p`#~Nkes8I%yLrJB|aFCJqZ?Q6(VHD@_V#USz+$&u&=C4sa
zT;F&m?f>OsMdQ&ZSy?8@cAA*~Gx6N&6>Xqi$wTH(o2XCBq6;@kVU&aQ&nC)0ljwaM
zQi3cEHqoClNqLG%Dl$w`nQoG*K$BGCUgK|Ke%2&)S!St6HZ0CaBeKcIB!+p1G$UJP
z9Q6NZXkR*{H7QFm&zqzz$t0GICb1HaZL^8~ze(&{OOy`I)9KJFU0Y4uzh3DMGD(l!
zLHjaG=}R+7f5MCm9M>y@X&Pndpo#rgO)_%WMES2*ob!6cg>=W5WbCL}JgX)dUvS98
zSrcOht<q1wsSnw^M4CCfWjL)>hV<Psh;%2kifcJrar$>NfBz0+Cap5=X%&yNRmPTE
z#f*PzQn#35TdCWPVk7^oxLMM=r48Ac)QavxtJpEO$F<UDYh~=Bm9g|zzBAP-mDSx+
zfxX;bqLgiHmD>6er6%wl_E2j@mp)r*3~H4I-0Q<zDNA#dmXua$c9bXvPl?im%t`HL
z?4nh&lDlc7wn{$a`DH8Gm2SyRd`F6sTct3qRdn260p}?_TcVUiwTeDnuN3312{cPq
zM2}=6bHdG%8*LVCgjw>onk7HXECng#f0`Nne>3d~vlO2s{}avVfB#8Jk!6G}KiNZj
z#4MEyJ?ww_C#g>Fks4i&Qj1%io%~-gOT)Zb8c&+(|Cz<GQmiyP&5X;JD#jo)`CqEE
zaz1l?5BeWDiY3O(`nNxct-4fc56)KXo6XWexXz7cbOuWmM_;MZjqKsNdSgwDCzz#w
zrU(6RGv&WohUR<d|CwcE)hweY%;<lc#Wib2|H~|6edNE#%y@;F_MTZDr@lW)yOC~|
z=2gZ#o<m-Y)FP{pMaTx^C^8fI8uB<&p&k1Z<ObwD$Pi>1ZSr~Iy7($|wZhOf*~U2W
z^~@QBGX|fCF3DDr$Qu|-Mi=mv9n9fIV#fd8?KDGhYr$;^c|{as(u8wx+yefO5pE5B
z+u@=;jPWDC<(Ox8QQvItmMlw`WXHctd#+1zO<kg;49G*~ryHdp^<B{&r~Fz)e<$J<
z#xK#|Vc@rfLEO?4Liul!tb`4cy_xaf`B5b|x<j;(7wA#)4{l(5pYi{hQKfL2_dndN
z6sK$u{j6RoN#pxBhc`%B2J`>UQKcf@qg2MZm8yUZQq3T5O%UV%_EDuSxkKuG_(l)$
zH~MX$z3Wj7+EJxB!>zRFMipb0M=@oLDy?jSWv<@9``;}V-3GDNGyc!udOPF!c8^=>
zaH9W<dsqDibVoNxH|8GjuK(cO>|bR3cgC#@PP>(%r42G%J*JGzZ=n3&AWm={Gl|=>
z;a~cHL8Hp}$qw}YAP!wX@I+lEW3dy8d+TMa6&zKZKBLMg<8dQP4jD!c1z*P4!euf5
z{m=)!jM4QZjVj#<jLD^5hHesLbP1O+R_hRZ>}ArPaG`XQM<>ZcOXg+L24+(?V?Ei5
zDKlF!B3q)nr8%mbK5I8^!&0TOK3i!Bd{62lx~1-LiBdb%Dm8xH=u~DaRfMbT%T_AV
zyXgaDDP`f^teNYUl7w#7WOPe0;fg%nQs`Hr=vH{&6W^2kc*+AqH)A@jl8el#r@Tmi
zPqL7j;4XQ+Jy)5&D^Gbn#EEn&uispv+{W|vdYzW<WaP4+eU37HAV+!Kh%83#(D1zu
z4d3=a=4zCNopc?!zX+aH9qoVwxF4Q`N=OF-q`+_B^)s9&m-8GLRi?Y#ie{SqA9)tD
z=R|Ulx!ASK=%4Yt<|7NR-_NstjC0JOGdT@&;H0m*nRlh^Fn#ef=7|cflPK&l5C`#)
z0IjlvJ=S-M@l)2>esQB%&+Hc4SJz6L>=MgAc+pS-%A$Reog64RiGh;4)mOBO9wm=w
zB!A<$Qji%ay0}2}AAO~0)}s^$^oX8gN{%xBPx!L<QKfvy!}C9`R60FM)xlAv+U`+m
z;@3(o$JN<8r9LB&`G3Y`5(0VteZ`PAt~6tBLHWpN4-^x$Is?TFZ8~4EBz20_=_@wv
zxYF(!SL}vB=~!?oo#lbjbvBUs-$3aur~SViD7}lm(zn&4^i$>yz~JntGE^T(`^Q&C
z!ad5UUpx91<BE&8-9EJcErE>B1hW3cS0;QsipIyu_b(=tY$jxK($>-b^JCm=OvwxD
zmi&x$lpl<LC$RtKW~Wj#H>MOvOep$eV@gT1Qz=E3p`%fb;#Ngghg2S1CsiC@jWSFP
zvNmo)sXNMeZ0tH|fJSHn12m^F_S@GXM#hIth6$y$dO|S|t&_I7QN?0kC)U|@=>M#f
zb{*~irFGJgg??G=n9{{_>wxY)r_ytFo%EhXKP_rZ=?}Nl|Cvw*v)0K_H0^)=IvLTf
zlhIkmql3m2*P>H#FH9(7j&-#E)-nFcc(wnS;tY4NM!+HNxb@=lb19yA!Z5cm!7<~s
zq5Fc@Gsf;#2GZ8kFIX=_8S5zrtTHk)p{P0*_Vo3vQClyaT08YTa})7gBl8YDC=+yZ
zy{6dp%yBZeadb>+MOqTqOB>RLv_gG4X`~%f8j%fpmtxSZM@NFOQSExx2-~GRaY89u
zU~Xl4Jvt^1sh(XgRjbUc#7`(SLoP*EzaAY`_C}4GPzsmXe`%IHnsF(mPQFo=wO+Cu
zOm;1~l$;>)c#i#-&W<U$fgSX13za<8FQ!YfQZA9~SmxaiUn03dZbi%Uk{3kzzsU2y
z*{$ei$CW~qAc~j(*Kg$=T%>-Q?x6lCQpyZPO1Y&-sUR;ap=#F7HQJ?S#V)l_S55g}
zUZgY(6)BB++W)!^F+ekY!WJ-Ywu=c`GdskbLHj?UNU@|Bq5s(-HfY~!r~OCT`rBFm
zP5&>hLmaV1O84O+r6;*a>7}mfgZ^0Z9|n)wWeA4DJ7i>Q5oLIh;`A$0T->u8#xm^U
zf$=1}Oh7h;RvP8RQP9IU_m*G31)XNbys*EDy~uf$*fHx?u9A}KEsRTUkv=uN?ke=&
zwn!jmCzQ``5%cHt>5gxas<T^U9=CepX)NC&E2?>Ji>xAB{KLhc^30&0q2Iqn41|fr
zZd%?VaY$Rx7RG<Ck~GX6OIJw-vORc<*fXw@?u}cdhhw_@uM!9J$82F-j4`_iu8Bf+
zIB|=N#BCuRwy+L-i@1`uaNpbyVVu~<b>UKtd5lGmf8pLV&M?W!43+GxQ0j(I$#sN@
zb~a4%oS~9m9x4UZq3rP$CN<OvMSWpXoE0W|C^3XeDaVu%u6!|6DuzO(@?@w~t%TD5
z3zeF=P^p~`l{)P8*c<R`tPd4C=k9}f&h0@`e6w~Wl>UFHn1e#)IBu4tFtJ95iY+iy
z+Bb5{(J-0Dy)z<Ax_rXWe+rZC;84atLTUep(smD({-{tHI2bB}8La<I43**JP#H-G
zl~L?Y>@NJ=L7}p&o*S0nENJxGX+Le3YzOOKJ=>Z8+s^#ocIJP!Oa9z;DOlJpx~1*x
z|Fm6J2=7BZ=LZ3>5z5N>wjfkMB~<lomujdP+D`emlf7wnN`2x^_J7zZjX^u5seUKV
z|4!cR9nuoNgZ|$RF`d{!|Brf#1!HZ~JH*09_SS_RjQ{MA_Mr%|J0qlHDMC8?8l>xF
zjX3)DO83xS3F7>{u~oExYiR#A(EhEV{aZu(w}JL=4ej3=+P^ile`{#}*3kZ~q5WG!
z`?rSnZw>9=8ab|}e`biZW`)RB;%jqUBjLzsu$6~MGSYtb8os}94d=f`y0Wej2Xq@k
zq=#dAogvb<7$W^cA(BGa!Ico^`$CxS3z3oO5Q)Rhi9G@7t`A|Zfp-nkAp<ng*GSfC
zh-CYONKR0Qq~WGbx<>M%LzssNk%EmO==oej`d>r)^%}-euA%>V4f<yxQpz!9gey-C
zk&37g>Yos)$_SC_#1QnaLYRLGkvi=4*c<R`3<{xa-6jUVcliFoHfaHkW1nOV?V}I1
zkNz+FUtrfdUq%0&{V%53|6-Z_FIL(AqW)FJPS~%&=SQ6XM^d7HRZ63MssDZX{tfd0
zN$CG2`l5&AiyqRAQbRpoo9-)hxYuv?rTt6&AN960Aq}%1v;LX-KiF4{i@su7_LWx7
zWk$Abr2b!dTdbR3rT_aX{r|T~GhgZ8T%8Fo|4aWb{Z*cSU+Li-y<4gOBhb%|@s$C>
z3=(!|75z6m`fu=``PdBRW0?aT^YIf8jKc(|A2-a3LhmZ(UwT(wJuEwVS<H##AD&_V
z6Z9XLe=me0=HrW3PKZ7O{f8v<AE@ujQcutpW&S^gItuBX|3A+BfAas2{xAEEG$)~t
z2}V2mrqJp{{}0+QTfjPt{vWh2qHhWvE9n2MqW^=Aj{{wwZe&l|r1YW#*a!dlem3Tz
z>5pX?U7ryP`hO=rW<DDIpH%dJU@Qau9~j3x0UDwG$upM;)$s8m<r8uUJa7`eg3ob3
z1HvQY3m3pYa9@WsZKceD%Q1(*GU0AS-VAraH`wn(J^(k+mbDZ1Da=fHXmY{#Gs+R9
zKe7o~2S(_HL2wX$3i&#`32#FmZWj@L2>HL*zlDFo4E4|DxW5-gf01)J@&6I_0NnRs
zza2cd{S^5mbZq{L@BXsyhlPDVrazJHxHITgvd>EV8OA8SlGei?ikWxA=w#myuvW7V
zh=F}U2ya2U{BM+}iDwk)gkNILhvS^%81&Dv&xarT)FhmiKHjO}%rlI!d_)_HeQHu~
zlwhu{l<;P-!3g{wCgFFGJ@l>o0jXK#J>^|Ih4~ZsC!9||xWU7|GON>4hpgv5nv>WE
z1e(AA6<O>{le$Ph^&=_oV_zEVwOL;=Pw|OV*YA_dh~tNFJ=bv!^82t8egr>(2;A>S
z?&mqV6ZvB}!F~M~@^N??9)W4x@{zj<KY(0@lQ4_>44ec5b-xK#aG!^6@+jsy@+N}5
z3<SYXIDU?>QJA;FJ&=UkL&z9-44!}^@GSfq9>PBlxxjH1$Vaf(AQ!Q>BD-N6eu4Wx
z7a8k7zK{J`<fuRTQQ)G#?uIeU9vF{CKOTCgkJG+>9sN@D>tRrbemxB1Hv;X8%+H@d
zKYy0*Q`nD7*Kzdor};hwvS)?)eJ1=&F?`<wY3@5NZD%hM3s}Jhb*b!+2n`wRj|fee
z4bVJC8<=>?Rz8#RqwKGk$^MGz?5~LaL^b|3=snaz0mtZ|(Enm7g5scyMGqwbuS;pb
z#Z)pEOBQ6u^Nl{pO}bdLkcT-R3J<cUJru0G!n_&#S{!9xizUWMFzey}vwjx6oBT}X
zhtadqA{&^4Z=7bIi*n|<wC_nX_7?0-GwAssD+1UT1Kpi6WI4ywAZwS|Cj(iHy@RoZ
z&N%dZkoG9_dXgDaz}|&)B%&9T{EC=;&=bPX#4%RPw&U+g8#+H0a7D9+I*fu7#%IYl
z@UC-2_7m>_=jlcE5yx;4V;V3R%$UaF%kmk|!C&D+I0qNe&bt&s;cECL&&Y+yo8T5$
z;+eV!`77LaBcFvV%z5wv=0eDZvxF-_HbVngVUlNW7+FNvmyy-TcaUF01Lg}EOSlMH
zFt@>VnD@dmaovPmK^{Ur2#>+Du!>tD(oT3QvKzS`UdB9t9Ob<oMS6g3E;N4dJIpV`
z>!9&@U$V|p|3MDqZX{nxn>=Jb>01!?KKd{3OCf1n1jXoN>7j%)EX{mP$`amZ{F{7+
z%B5pcl}WxHCZ9L|R%%muE|Sp0h<I5V5_wL-c`nEU19{OL!E*w}ljJM3f*IP*lCNL|
z8?>L~xquGngsupl3+RR(=sifjLO%?^;8F4whG7Io5As}qYvnKE<{HKlSl7-qjB^bW
zs$8P4OTTdp>7k$LI)R=Xj2{2JIFZA+kKjLu9Kyd}kDeX&-jj3ab)biLmcA}@;pbrR
z)4NX<{nPg2^o@~L!rG8+_*;;z*v&}e3VMHrIcY|=Koc0C0UDtW)Ndzv*HIv=j?PIn
zvVvnPiK`4*PFy9(QZ4Ngq^g%$$TbxC%t-;)r0z|ecZ|J;As4e2vde$RI46U&i|9Qb
z|E;(FKgn~TFiv+KTmpgD(YA*_<9;>vAj}_NUVzV_2Df0`T9H;bjoVi+j(Icg6j_?L
zFuwy~*w11AE9R}370Uc?FkgtgfI2x88IJol<ZiePBCubA-(?Vm`3J~{aNmr40{H@5
zjroUAfq4hA0eSOhwA%^ak9inkF}si(Xct5gW-r7MW<R`-``3&yYo^ivok9P1j`3e)
zE_UrQ`roVQ{~`;p>u@i`zi8QCicg^b4JGV7P|A1e%i=aM|Gh~nkd>?zu0mF0uR+#k
zZj!p#-!cBPiSeIJjQ?z6{AUy6KYXJdT2lT;jGj$mI=+ea?<VviHZlIQNi0Y!b{n#N
zb(7eU9oRdOUHChY-Pn7Oy&T^c$M@|MHnC3w-?v{qAwx-<`2Nu*zJIid{@*5Xf(x_z
zIQn0So5Yj6Nyd>A9CP7r@*OUMAo_QgATNb0AQbkIW>+Kk!cA};+yaMSH|&SI;U2h^
zuy;ee3Tgi=ZZE*E;bGjek@q4K@H+?(U`~ca%&qu!KsUHx7+!{Pcm-aAw;+kI?;+oZ
zWXxY8A197;NQL}N!R;bsHRa+{xF3bXxTV5g++V<6fvkaU&<v6P9`YZVLtf`DeI?qH
z<Uiy?!CCSj3ZV#!Pm=#o0;N!vIZ3@XDHX{pQb{~j7WDre=>H>Y=>yaqX1!G!>#dND
znXLaq8n8DbTcRf!|D*j6tzd>Wuz(e8&^}N5AO5rc53>Whp$Go2?SB*N(O?8dQ_#x-
z7r0>zJTPvVlnK+MxYRTtP2O50oiV=*Z@{0hzlYS~R|r=#?-B`np%nW~$j9JWcma+;
zHsnDS{uRhH(!2p#kNqg-w=lQyE`Ek=#;qOFanFEDqG{)2&&1r1jKn^I+>gw{?Lp*Y
za0FfeEpB7@TQT>84!a(PF(1Rd9P=y4-@}_wjs0u<z5#`O1}=p6a4+NCF6Z5@Kvpi1
zf6L_Gv2Vru{q~h_rH*H!9=`^{c)#C%;#*PoZ;B(nQtDO>ZHz3)UgY_K+!e|X$itiu
z1^DTp(Epee@mv%S@%-?dlpss78;QdNtzd>Wuz(e8&<?D(ln&^GE^t6M^gu84K|io2
zQwCuOhG7KM`X7}^ae?}+nKAIdIH=#6kq`-aJqvOm7qpNE`B0$py2ksjhAEiN=D!e%
zpi;%_Tj`CV((7A=Yn9iv%IjL?b+zHz4g(UR+NU|oEOE>htIU@3uUGM!N-&jpO{JJh
zy{0luWnPmSwtTji{|X5?m0hp?HOOY!%x|BB$`-jw!sKeX28Fq6Beud0_0MaHR82e8
zzg_BIl=`<@{kuW^+oS$P^QQ)jRF6`Fsv*^YYB)8R8cGcm3%9_Jfqeq6jf30ZcDMuL
zVL#jncR>Q&4L^Yca1ibRHV(e_K1hT^a6dc%N$?<~z{BteJPL>5F?bw)0jcl=JPA)h
z8XSSA;Tbpz&%$%?Jfy=5@Jo0RGT<ec&((PUv%UX0-hZw4KhOK0@BP<#|BJl;#oqrC
z?|-THzs&nz?)|Ue|D#bg&W}db*ghInkNwEWVDGgbxzuAna#gASZnOG7mgW6dk6qAq
zdjB2jf8n6|uUCB+^lJEpGQIj=KBoRRHmd(kd{F1w1&5XYk83UJe^Y_?Up?c;P3`Kx
zu|xguRb%@kOO5{%om2g9RL}lN+qn8~QLpV2i+bKq^VRU5<`1a<{U-H)Kuv+geDw-H
z>s9aT^XxkHzo<?9H>lV1xlKLq=e_E6ev#d!{#WO!|8;F&7{4<tHF^!rYFd3^R*(9k
zSG~e7oa(hL<)|?)6|3p9R89O_<vRHxI-=nWylhi{+hn`!kR8{{j%~8z!>rt$vg4C%
zr$k7^^%Aj7B6jdo1u3EaHx=;j4L}IhKf>z~;q{2{dPGV@Bp#7ok4Udaq}OAoMC`<4
zr`Kbr*J+nT?80f6*J+p6DM})uaEkIeMR}ceOT=!Rc6*(6d!25Oh#PRa!RvH`*J+PL
z?7?Y|*J+Q}DOw_;af<djMSGpNMTy`~y)gtA>%D39pNrw2^YhOs`R6SB6EXk9<&DOB
zLuyo!5_!FFr;$7OsY3OR^d?SZghYBnM|zValAzw?i1a2z<W7n7CPJh)5h8c7W0&_>
zZvsR{Nu>AYBX>)r_m<Idi1gkl6=tOO_NdJwy*Cxfg%KP79OaEey*%$(cFNA{h4g0h
zWvBPjc1Flf??vs5l%4qQlATeqbGPigL3Zwuozb$BXmIpKrvBroM!8ds(tD-5WY_gV
ze(l;JyCP&)r0m)$yKsz>UAtx14YF&G?249M-Uz&Rv`anIdsk5sb-j?U48cd?zuWu2
zN4z&0rH1n6^lsUGy^z)3+up4@dvp5+x#4;tyKnGbls74EP(8i*zDM?4FJ!zoVfLub
z-n@^N=<9{d_a;-c>g>(`y|Q<2jQmJ$lo(+P=oq+3_Q}4@vTvW<EU^-sD6t<FRHw-;
za?4TqvHbW}`SH`+<j1%@`MBIFx9*o)ACy~Pl3R~^U|izlHn}bK0l8gnzeR4p<0ZNM
zK#JU+A$Q0f2jq@Ja>wI>W4y$xKiDtvi4uQE;vbOxa;MyRi`>b7g4`u{?U%dmyH)Ob
zLhgEbL=q(77D@QAB-|<qM36wN35O)%eo1)4E_ch_`{nK@<nE{CCvrdz+#v@JB)}m#
zaE~0gFHQ)3Q0|d?o|Aj!-dp9~1i2T(3v!>_7k8K3M}+q!NTM8)L-G8)@aMhp`{n*0
z%l!|^{SV6n@<2SlcuA56<)`v9NtTD?zvSnVA}QO_wn@tNN8mY0d2L)CmWNZGl}F@}
z?N39xJd*O1Jc9L6c_fWrB<z$&<<VQ@(G+>~DS0%FUnJoAk~~UE9+tz0<nXhP%VYAG
z`b&kUL>2s}$>Z`kQT{@H!M*(=MN%bI{lOEGdPGuRkkl9D33*bUjFl(3zb6Uy<RN+T
zetA;86f*Bg{?qtH!cKYed3lnQd`g~rLY_*Kr;f-|Pd_eCy(mwS_-R5Y0;R=D+9651
zU(z0yw8tea^(jf?hqvml26!6lbCO2NefJ+JN90JX9Jx)79FimV%aJGKNSYjZPM(&h
z56Ls~jGA!I#L6?b$urpRmuDW4XP%Rz@~k|Y@~Db8uJ2@|JSWdRE6+VI&r7<zATOly
z^O7pRlo#d2RDP+FAukCZm6l9-xm8{^%gYvd*(xu0$;-VzhTDXDn$*Z-mP}^LWR6Vc
z%A{5%^JFq#CJSUzCzFLTStOIiGO3rz5}7QO$#R*jkjYA!tdhxUnXHk?dYNpH$wrxM
zl1YP18fDTXldUpomdQ4mw8*4QCfjAwE|VQH*(sA<GU<@XZkg<n$zGZ4lgWOW9FWOD
znH-YIVVQKwq)R5<GC3xb9+{kwDUD2J$yByX<;Ya7Olf5*Pp0x^sz9c6GF2#3MKV<^
zQ+k;yk*QLds*tHlnW~bhYMH8$salz;lc{={YLKZ$nQD?LgG@EcREta*Wy&N|tuken
zsWzFi$dpy4Y%<j@Q+Ao^kf~0Y>XIpkOm)luq3XVp>&VizF|VQgQ_+nkpa^LQx{-9F
z2^0Xrd+)vXg!jNr^N>}cV08!MKV9ags!0IEdMBYLf$H;1QT(>eow=Ue20Jsqch}zz
z%irFYza5pootD3ymA{>rzg?8SU6sFGm%rVVzulI<-Ic%Hmk({_!@KgKy?p2>A3Dp2
zuJWO`eCR75`pbub@?o%i7%Crz%ZJhOVXS-@FCQEroGc%v%7^LlVWxbTEg$B}hxzhh
zp?p{@AC}68<?>;rd{`|X*2;(V@?ood*e)M-%7@+ZVXu7HFCPxdhr{yWefe-yJ{*@1
zC*{Lw`EXV~oEIJXA1=#>tMcKx=*0hUTYN%!Tm8pC`H!LUA4}ywmdk(amj5^`|M9+j
z>@Ob&%E!U-aj1M8E+0qA$I<d}tb80VA1BJk$?|cke4H*HXUfOf@^P+wTqqwG%g3ei
zak+e4DIZtM$BpuFvwYktAGgcLo$_(FeB3J^_shqF^6{{Iw20I4@vMBjEFZ7R$LsR(
zrhL3DAMeV?`||OjeEOk$YAK&u%cr*T>0SBMUOsh{Po3pcSNYUkKJ}DOz2#G1`P5%N
z4U|uV<<n64G+aK7lux7O(^&a5UOr8fPm|@-RQWVrKFyR*v*pu5`LtL*t&~q2<<n;Q
zv{gQBmrpz8({A~+S3d2RPY31GVfpmFd^#$hj?1T$^68@dI8=TdE<cWxAIHj%<K@Tc
z^5bmzajyKhSbkhCKdzP^*UOI^Mf3lU+vUfd^5bs#alia{Sblt8emp8a9+w|a%8#ez
z$FuU|W%=={h(i3-QhsVJKed&gy30>J<)_~AQ(yUMsQffsei|u1jhCP1%TJ5tr>pYQ
z?f+G5-q^OWY2;+IeC{ZpyUOS8^0}vc?kk@M%ICrId8m9IE}uurXA_K-&*SCuMEN{f
zK2Md;)8+F_`8-=b&y~;f<?}-MyjVUjmCwuN^J@9LRz9zn&l~0QX8F8TeByoHDW7-C
z=e_cIzkEI@pAXCD_vQ0Z`FvbHpOnw1<?~tjd|p0Zl+Tyt^HuqLT|VEG&$s^lPwIbG
z|Ev1n)c>yj5A}bl|GRv?E1&Pn=ZEt159Q~U@^fqXxvl*CuKe6ye(oqgcb1>K%Fo^9
z=brL&Z~3{e{M=uD9w<N0m7nL!-+w57Zz+FoD}V1SfA1@Q?=OEJD1V<Wf4?e!zbpUf
zDgWpz{}?L&7%u-9E&rG-|ClTPm@ofWDE~Mw;wWEQ%9qyiWvF}^E?-8<m(lWNqI{Vw
zUuMde+45zse3>s_7Rr~!@@1ubSuJ1I%9r)>WutuAEMIoYm)-JZuYB1rUk=Kb!}8^Q
z`Epdg9G5RA<;!XLa#p@vmM>T3zpwviYx(b||9!nV*=<hto0IeM-|wCLbI8B{yl74?
zo0D57|C}rT9QLodGW?$p<)0(|HCIOcYtF~~YtG02^Qt*<X`(rqYEGvA^Rzi}X{I@u
z_3!n(na`Ym&6WA`&xPiEsX1A0PFBi4SDW*-w-eKDG?zA;lP&+6^X<P*{@L<+`Jb)M
zUN1M7cKmDZKW6!5x%{$HepxNQtd(EZ%P$-L{j%Ad>@_F*&B;M?a@3q0JNdP}Iq7Il
zI-8TO=A^qh>2dOVbI0$^9ltks{65s2jF)DXo4>aGvenFI^VfI&GtH(=dMBslmo5LA
zC2loKbkfXgt68v<x62;4)hvIjS^ieD{H<mdTg{@knn||HFFXD<b8*s4)ttGvIdlI`
zGixVr6YRfDU<o^x@H+9q+r+QS+iAwR(=2c2_0C`0UmxbInTfOJfxouDKEhcu`({Wx
z&5(B9ZgtzPf6e1|o4M^abK7m^w%bg)+f2IKOuE}V+*Etbyqz?+HD}GOzrOo#&B^O3
zGyN;e$?KGFi+Am9%H~ncB{O+7I(fa?ERnXEYjf62@pjoIii2j+2hF2iSHtT{cwGgD
z<(K#V*>lp2{^*yJU%P&N=fdj^zxOpmDh=txzh)jznz=Y@W`FW_`E?{G&2pT)-TpfB
zlV+YL&Cr~@KKc#i+S`q=C$G~t1NH(rZ6<Q^dbOGNX`^{NYo=%xaoWuE^!3)aXUOHZ
zYvy*^%<Z&!_?dsr9Gv`h_IhhGhqGo5XV&*s^Eelp2fd#Cx7Vk@tM<RS@7=FGF8tbM
zi0}TI@b{5s!q*4AF7xc|krsFEUo-ggW?@d6JDRiRu4b7|n#Y|ti#~t7^g2azzqjW_
z^QepFW+#7L{_EQ76wS03&818KnmexiYZm9^_3GPTU4OgwIyje`$2DgjdG&VLrECA1
z$6hznI{EjRX`5@!6wS2PF8%t0_pXzH*OOm|o3p{!lef!5&E;kXoU_-Hk@9Pce`fDw
zzd1R0J+X{mTmCv}=G$TkPF!mBubHy-*LT*gGgCTwyQvvlbM1A)<{_=kr8fV5*=$ZM
zrmdN>tyxT)m#PzwaPsf7X7c8NZOsGS|IX*2lhx*Ay*V+Tcg=j9{B_nm)V0?KG&6tq
z`Wk#aGo$8OGplBaPF^Se>*RIfx4||ae6O#o=7}lI{q4<?nm0jn(oET$nTNBtyI&Xh
z>UQ!rUGv&EfwNx@-!3%sdV5s!j&3f$P13yG-q7T4myPxHQb+l<qnXx8vsh=%Lz**F
zG-n>!@pjMa@?3tq=hfNKEUn|0qvqt;$?IdyxOwbf^Y=ODGH=J*t>))SXS2-CW^yNg
zU4Fafp`Fb_JAKCZyYgReFrPf9rI|%{bB76fnuj@gyKL+|Z|lCN8B<R)rk-X@J<V*L
z{QInVWHTo3-sZ#%dtT?z%-sX6NsqiLzg`!i*I)0;uMg$7AIfiS<+pd`w~q2#XZfwG
z{MJ`~>o30zl;4KSZ&T&B<?`EF`E8^8wq1VPFTWj@-;T;}C*`;6^4m@M?Y{i}L;1a>
z{N7rAZ!i8j`rG(>rv>`+_<MKxy|?_{SNxUq3jcky{INFv$J(v4iMO-Kx3jsov*ow5
zmAA8PlkL7;JbXJla?5G?V`KS`4fEJoc{@9NJ=^Rpe{A-ZKei_R*mCFA<lEWY+u82h
z+40-i<?Gqr@*jJq+*^4)+wUrW?3>5Fc^r(DKMs48>xZr%x$oo$)v21x?mKbcne(&e
z+%y+6YEI3oh4RN${~wpf<&W##Kdz_W&X&s`H~oKHyYbdz?tb{={>)kHANLo|`rgil
z-_9o9&Su}vj{dmwpgU*q&h8E6e(3e#_am>9+|Rw8Ex(;@yq)d7ogM$zV~*b*dh%a4
zoHRH5`NKx}vu&=}->q6yt7=p4RJ-a>ovKT9s~**>`c%IfP=jhn4XY6~rpDEbYUVw!
z7SxhjRx4^%ZK`dxtM=5sI#7q|Se=(Y+l{N;^4l%1eNs)SX=PsR=GAUq?dH|Krq&ft
zyZN?nDf4dMQRd%n{_W=9ZvO4&-~L`5Da&a;QK#yx{Mo_X!QH{z!PCLh!P8;<9hTQ&
zejVo3VO|~P(P17P=Fwpu9p=$t9-YS9>G7S$+-d!Ejw)mCH11C8qH|GsT&KtFS-*Rx
z-LqczOuuLPJ<Hs?QFrCfeg1vV%f9F2)N^w>q0H;ldObC-Q|2?%o*VO}X)jHCX}U|(
zU$R^_w_C=gF<n~DrRVU{vMxP`mzH<wIlQ#YOV8n@=kSvG((*5j`I7t6^LS~QmzH^H
z-Cr_XdR{MQl`&iz!=-g|$#m&CxwMWhJtvo*lS|LZrSV;QPA)wsms`qna%ml2dQL8_
z$4l$+(sOd@Ik~hxFRd5voj)(F&r9p`()zsgI=H-0m+DGgE9T2vWgT7KtB3OE75^3g
z75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g75^3g
z75^3g75^3g75^3g75^3g75^2#aQL5B{8#)}{8#)}{8#)}{8#)}{8#)}{8#)}{8#)}
z{8#)}{8#)}{8#)}{C-5}&nx~b{ww}#{%ihg{%ihg{%ihg{%ihg{%ihg{%ihg{%ihg
z{%ihg{%ihg{%ihg{%ihg{%ihgejjLmUUOb^UUOb^UUOb^UUOb^UUOb^UUOb^UUOb^
zUUOb^UUOb^`Z)aan)90Tn)90Tn)90Tn)90Tn)90Tn)90Tn)8P9hVzE=hVzE=hVzE=
zhVzE=hVzE=hVzE=hVzE=hVzE=hVzE=hVzE=hVzE=hVzE=hV#breB*h(@jTyno^L$Q
zH=gGk{u}-q{u}-q{u}-q{u}-q{u}-q{u}-qe!u+v=MDc2{|)~Q{|)~Q{|)~Q{|)~Q
z{|)~Q|1IY&=Pl<g=Pl<g=Pl<g=Pl<g=Pjo{D}UZ{-g4e@-g4e@-g4e@-g4e@-g4gZ
z-SXY?-SXY?-SXXX-E!S>-E!S>-E!S>-E!S>-E!S>-E!S>-E!S>-E!S>-E!S>-E!S>
z-E!S>-E!S>-8L@I>#gVY*7JJndA;?#-g;i|c<*@cc<*@cc<*@cIPW;`IPW;`IPW;`
z`0n`b`0n`b`0n@w8veZVJl-+h@!hfAdEV~$?)dKb?)dKb?)dyj!k>41cYJqzcYJqz
zcYJqzcYJqzcYOXD|LIThpLa}mJbpao&pVFhZ|S|U-y8eAvELi}y|LeWuI@co_YC(8
z_YC(8_YC(8_nxbJ&(%H4J<C1IJ<C1IJ<C1IJ<Gl4?4IY|b9T>k?>W2Yy7!vAXS-*+
zXS-*+XS-*+XS-*+XS-*+XS-*+XS-*+XS-*+XS-*+_uSp{-Fxot8Sfc=LHYBZ^WO7z
z&w9`L!1%!U!1%!T!1uuS!1loP!1BQI!17@H561ss{13+eVEhlp`e3XN#`<8K561am
zoDatNV4M%e`Cyz6#`s`7561IgJP*e6U_1}T^I$9wx8+|y45?8yrjGsl>+Jt`>Fvtd
zzkWD#Z*%r~sdc;jtHZ<nng5qR^Z)8Fn~oVZs}9P)I{Vdt8Z7_nYE|!4r|MRH%Hz5`
zu4_$gs9pd5I{W`!db@J)uP!s~YR+CSb+?p%b(>|kS$3Oc_mWys`^uEvhw8mLQm4w`
zyD!yM`B%>m%Hw-HzNcMvDf8&rQy$;r@jV{jYo5K!%6xlQ)s8a8UeooOZVYR}G?TsM
zUmNDL(NX@jJOBS!%Gq-H`a_FqRb8rE^{8Ier~1`^8dO7SSdFN$rv8_&ejeSqHiv&-
zy6*ey(qr%Zzb_s9``)MS(Z6nc8lU|4rMbPoE;(HN?@P0P-P_E4LQR&hEk@H~BrQhK
zVgxNl(Bi(9DK(?!)Ph=4D{5VBs%`aN9jOy_s?ODgx>DEb=4%UZ%dNUAUt7Drwps_R
zmY~(9>9qOUYR;|JRjWC-u6%79{@Ql?wap4^uIzqo8~^LxuWct^+h*MQc8_UxU0PLZ
zO?_?aZKh~7p5}^at@~CZZQW?<YwNZfT{7<0t){+OC#{FC3-iplZn3SFXQW=&Ut4#Y
zdYvUobIB5|fYxTIyRXY=rtknWZhITBEBjyDnn~TW*VO9`&0<|Lq=vfR)YrE4|C-R1
zzi!(%{MY$4%V}=@+P3_<+^#q5+fBy&*YXZbcJQ{OgRhOhS*}aQ@x$AdLpPY-c;T#;
zW<jo-5lz1<&FcCe%lzvS&D+ZI#I_za(`tA4+Ip-Si?#fA_WCSgSzgcU?af^0T{=}~
zO*IRB9l-0A*JtH*(GAcmrPV8?_2P9+{Pombx}jN?mrm<VQ?Ki-c`Dy7b-C0io7d94
zdMK<-{d<(n)85vq+SEJMt~yj_Q?E;GhW(eY%~Rjj{knD=LR-(*wgG3os!v%%uh$2@
z{<X^62Hi6Fx+<G_4>k1)^8D_);ywKK0{$Pv{p$tYOzyqaHrmwN#A7awef2W;N^Hi{
z%)s^L`Fgu#Ew_1}zP<(i{W@y~>@BhSy3__T{(4U{=|odsjnpb_-F%zNq}#lW+oqcN
zwm!T*s+s$gyWiH2w_)4#>)LD9r8i{rCbN2-H7}`V%r1F)JT1*7F9uIi+iX*>uaf4>
z^WMz$?NT$Axu#y1(Ts7vsn@mU)$)47>(km;yd~Nen!6gem*?A+*SRfxZPt3T#$ECf
zK76}mC|)vr&2ko-dVB0&H+VI@-qySi-=3RhWL|J>K4iR~nm6X_*?+wT{=e_BHXko-
zt7@&OS6DO3w@b~8e4@1ZL}~Mh;`6B)!lv3%+p1Z#<r?;1&!;QR?ad>t?=drcyJUrV
zs+yad;q1K)&{?y7ywl$1+^miUZOt@ubjjPb&6~2#Tk&<-r_Q~H+L~vid49ZY+HO>{
z6mO3aYj?!$BW@eDs>ZyUW_@bRx_`kNWy#!^iI-hp?NsDzbIN?SOt)=tyViv5>#uv$
z<?Ft6ap1{(?<IBQjdIF=?qz5D<g0Ctuf7X^^_}9Izjxn$m{M2eTT4gz);dwXS*zdd
zm-yz3z_+f=@~vm7eCumbr{!D!bon+ouAC3`mT$wm<=dDVpI6Ju<;h89ep6@V+su9W
zHrrXgZFtP)M)|gVpw7#;o#XOt@1%S?94OyTn%Y;F<=dI%+qVB^+g|se>Q+6<vTVP9
zvrYcZHug8$*WYYUf3yAk%{KEl+sof<FMqR*{LS|9H`~VFY`^L{EZ=M+ezR@(&9<Tb
zLp7$DY|DMKo%YSP*Eic)-)uX5vwfo@Q4N=GzC(ZWUH6;swcmV?{pS1QxBDCQP#%3(
zc>H0vJo?M|=<nmB7yqNT<fGTv<GUSY8vAn|+uK#Y8dGa(U75x{n@9U>bUdnkbx|HW
zOy6O?9j5ms?XhEAO{hs_866(qVS4*y9_@R1?C_Y*Zq=ve)Ph=4?&~tIt~S-FJicpQ
zSzed%cHNZ6?ha)+-N(v&e4~2onN_RmM4gt$UXSf{d#|zdns=||_nLR_l`@|`wm$Re
zGtNHC=;QCZROa8u-`}F%Dbw|vPyev89{Nq!Z@m4+)Bj#sUcdSF-zkq9unq>!%j2N&
z4|b^@#W=XAjBn679JC$=57m+K_#xvMGVKu4kmq8EcgR?VjCI)jhTS)8oqL}>j;tul
z8nukEq4GFx`tec4F>d}7#yr73VVzHy&xB=7SjL3uC#}~>^P4n}Da)K@nl|5Q>uc6J
zn6(aPEpyKEHs|`BG0!nB^p?lP0cE-+V_Y)sCF_3K{FbecW#d{gpB2yTie;@%DUV&X
z?9~Hh9;>Eb^<1r4{+ea4SubnGyyo#f3m(^uY0YbEZS!d#W6kPaM_ET-H_Uabe%&PN
zX1{6nn-;ZYxLY2w?e^`B^0?z^@DcL3XDEC3<#FHY*~i#--+`eXcq$KEKlIcbn#ZBX
z9va@Ec^+Ebhn~LohW6eneQ%!c-S*yXM@!1}BQK#NPv4Qp99ad&R)syzkN$^Yk7wq2
zW_f4GXYN0{Esy8O=hx-&!Z<IMm3drv{Dtvcc>IN@@xnNL96WmKKicK=XqVFC)yY$r
zS=p=fXb;k(y+@Ds8a>);^JtIEqdhT?_P{*abMk1<$)mj^kM@o{+7t3<55}Xt6puG%
zYcIs3y$z4{C_LIz@MtUi(bo5)E$c^H){nNJA8kE9-kG0Ih(}wxkG5bRZMi<$0)4dA
z`Dp9$@d4A8;p2nV`*2mBY&|^vFjt;h&dO73iyA9WzV<x%0`TOI-;+JiPwnqibKa*0
z)RNj#*X60hb(?cfJ`tWe4%Kma^6r1Km-VUhygYgLKXrMmb@0?Rs}9tua$k3^8ddY<
zsb^drl_&4=r`|#3_FiM@y;b++sn3}D?#ffY`ShE%-}3u!%F{rbGM<6$@-%3igYC*#
zhM0y{%F{64u=|EDl=+OzD9al)?U?b78H;z_)A)qiQ<vq*JML*>QcWxKns~3Qt4Z^o
zv@Gwqr^!{dUY@1~o{kM;%5bOLHMRdZVl7QmcppB^kk1S&qnUAiwnLfQY;)O?=ByoE
zrcZOG_a1$kvxerZ^*OqEYj)mj=Z$RM8lAV81tVB6-QtO|_Lp2<T2wntJ>Cv}yPeTU
zUY?e*S3I379<X9Y-q25LW94bRwLEPY`lbhLxx8foJMP<czH8xoR?8m8o<Z!Jejn94
z_-WtMxqqRoh64*c@H8G6#DPH^7?|TWPlx@=V-C&7yYA_|%kNLilRx@TN1Ns881(p|
zJe^oUC#F5I`c5p{&h@8Lk2!@rHT|japBej^r|rx*&n)-M%CfWl>B4dx_j$T-`-QRC
zDgI<9_LH5tPj<vUU0c!qh(6gN`D7>LlO2vvb|yaAG52Ig+>^J?lbvi&b}l{HiSqQY
ztM1G553_1PU6tpS9<{69s|$5kp8a8cwiWj53)OR*%MPeK`-A%2-mMPGbH`kHw%6jh
zbErIfsXu$EKX+}Z9c5l!XX;$tsE6`w_rr6y>AKCU+q8B;2m_R7yBMCm6`t)<c(yCy
zxz}}Xf#*K+@rUWT|GGR6yi*=OVA%t`YF!<d=RxE5$LD#-?OyuN!=@QFE-(M*;iK|A
zV!R{n8<{H4ql}~O8{H_+V;(c+wy{OEri^=xeat+^PLy>pX53@e)41`DTi&?)CoFqn
zq&#~&JWsJq%`5YmZdH~!%{y(~%~(G(%(Is1kCM29+Ei}$MtGiUQO52M)bl*=yy^Ur
zdR{QiqUjgS-wXbE+4^0!E?4^0aCu%e-Kxv0#<sSjtQ$u<o7exdzckP5tn0?*7goe7
zlx3_N`?|4gSms8*GS7`cWh@()%40Xp*I%XQP0QM{j<$_y+ilyX+csXm+WfrZ@jKS(
zu5s>q?5^j|`{wz;G>4`?v`!B#|GoM7i}QSB{71%lY<(TO?Zo+sah+HfC!F2~&%Sg%
zpL*=6F`qJ@dj8L>%QNfX%yj4GbMCPhp4SWGaKP!=m(phkX`X$_e0I>~*-qbQJ9eM#
zpnbN}^!d)X>@0n@Q}o$R&}Tb0pZ!UAzW2P?nfKk!rtd$rt3EZNPRe&*zQ5aT^S$Lz
zy;t|@p?q)cR=sLp9hdJ8Sbl%!zRoGNs<zaQa(}1!bl#TlUFL5O-gkTMzIU0<`{H}o
zV)<@Y+jqO#zWdVm-R`yTUi;sB%)4h;&8s729zA#Ed+)m1DBt_s-e);|=jD69vG$KD
z*9R<bz;uJg=L_0*U(mh}&Z$jh8NQf(A9DY&`-dmX_fg9pb=#=<j~e6HMEO3xR=!X6
zE9=CUsqa(fKV`mC+|zEK9#ZBxZJu7+-=}xWcOUNGXI!3LDc|P?6#u;2=UEoaW5K)@
zm&^Ah>%fQfcVDo+FB{vkb-QfbtF3BK8Pn>5GVQ8$vFdSambdP4>&CTi9ot3teZ%_M
zFxE|D+w}M?^V>169qW9@IK39X@49c-G`q&WYhJq^yJtRo#<Xw!`{wlhz&d(=QNACU
zr{nD3kH(d0kFDQhV>o6y<~Z@V6JvA?{<~xF-%qUXQ|CUbzn>Y`nb(HT=I`g$<GJ<d
zv-$gl^?l)X2cW+DI`iE@p6?E1e0T8Ty8{v59f0`m>&kaK)4$t^{@o7n?+(g;x6}LO
zhgEf1UVQ$(*iHD-a;$F4OY5K-R%7MGo|PAy|1UOZU)t`<i-X@U?`F%3&*2vbonLI6
zyf`5I(%GT<l*e|O##fG)F7x(l?l0Zu**mGG%1ht7^3rd4gIDEcXuQ0PUzC@LA5^c}
zSFTUZse|$|?KZE4m+3QQUeo3?V>vU%J!8Hzrk!#7jCswrsTJkE*)8S%S@+MmKF2lR
zrHsoT_ZNTMU*=6a&%9_EKJ#7{mz3rE+<RGkC@)?^FH7yp{Y%EZWZI>h^5V<-i@)tJ
zs~+P^`pc^MtR5<3TXVZ_-Y;G=FB{g&#-`dUFPp8(`rEuOFWc6czyB}0=DF+f4uZez
ztt<1`GfwRqFZ(mf^&{)$XtKQcyZv%v-JhA?x%GN+U0xiFesR$B#aG)G2Q6Q&&C9{b
zmuq8mu<*r!z!wJsU;I6Oaggoh#^ZgheQ{vz#euUIf1h7`wSBoYRtLmhyiQ&mjCyhK
z>BWJj7YA)#9JG0H0OrMkmlp?K>fC0j^M|GGU6i^%QUu~ui)vME>YZv=9ja4xsczMy
zdR3q5R|9HL4XI((%xhGQsc|)-Ce@UhRx@f=&8c~{pca)ey<b)<YE`YNb+w_4_5GIG
zRy%4}?Wuirpg7*YS4Zktov2fFrp^_kEtvZLN?ofPb*t{wy?Q9s^HY!bkNA)HkNA)H
zz0d0rzxR4Q;y>a);y>a);`jZm9`PUXAMqdYAMqdY`)sIwt*m<9>k<DEzh6MBNBl?p
zNBl?pNBsWI)gyjeSM`YB7FM-|Rgd_6|Eov*NBl?pNBl?pNBp+H>Jk4Dzb&!q|4vl>
zy{t$4NBl?pNBlmc>Jh*HdqVZMv>x#v@gMOY@gMUa^B?mc^B?mc^B?mc^ZWi;kNJ=J
zkNJ=JZS~b-{$u`Q{$qZBqw6vMG5<0DG5<0DG5<0DG5<0DF~85uddz>!f6VXmvmWyw
z^B?mc^B?mc^B?p3Y^}%qK40rG|1tkD|1tkD|1tkD|1tkD|1tkD|1tkD|1tkDzip0s
z%zw=9`$YA<y4rfJKC`RO?0Uj~!hgbl!tXP@`V6lp{CcUXUaIPQc=bKJ+UZqK_)qvx
z_<c92KI`iV{|UeEDfNW^g#U#9g#U#9g#U#9g#U#9g#U#9gx~j?dcyCsqMq>kj#GWd
zsVDp={3rY;{C2k06aEwa6aEwa6aEwa6aEwa6aEu^-;e4E{|WyI{|Ud(p?bo9!tc9N
zJ>}Q1Q&0I%`A_*z`A_-%?W(8zr~Ie<r~Ie<r~Ie<zIWAA{!{)_{!{)_e%mKi^G`kH
zw|AhP^7|WDH3L=6K-G3jJ>@^;Kjrt`t)B9q@@o{Tr~Ie<r~Ie<r~Ie<r~Ie<r~JO_
z)l>dc{!@NCNvmz4ddh#wf69N#f69N#f69N#f68xrsh;wm@!M{yzB5*PFzOlq8UGpo
z8UGpo8NVL7s)w%Xp{r;7XZ&aU_IT7Y{xkkF{xkkFemg?z8UGpo8UGo-@1*sN-~VF0
zp7EdYpYc0@S<m==U#-3))HD7wet#?K8NY41YMZXwvr^Ca&-m?Osb~DYPt-I1GyXIF
zGyXIFGk)JQs%^k}#(&1I`KsD0Q_uL%_|N#y`Oo>!`Oo>!`E4`SbN+MwbN+MwbN+Mw
zbN+MwbN+MwbN+MwbN+MwbN+LFJFDwC|2h9T|2h9T|2e<^&3rxQKj-)Tyq@!)^Plsd
z^Plsd^Plsd^Plsd^Plsd^Plsd^Plsd^PltE(^P$@s{XdtbN+MwbN+MwbAEfF>N&sf
zT=ksaHbFh-zu>>%zu>>%zu>>%zu>pcP%ro|_%HZ<SF0EN7yK9e7yK9e7yK9e7yK9e
z7yK9e7yK9e7yP!rtG!@V?`73{S@mAl3;qj!d&a6gWA%dng5MspY7bex;J@I%;J58w
zFZeI`FZeI`eQ&J3H&%Pn>IMG=zipg)!GFPj!GFPTo4@+LQ7`x}`0aJ8_PW&zemm={
zo%PjrQMJRqYQ(D=@v26=su8bh#H+TKs-5`NPW-A7uiBAcHR4r`cvT}_)reO$;#G}!
z^^#vBUe$<KHR4r`c-2n-YM(>Zh*vN9HR4s@ed;B@M!c#KuWH1r8u6+|ys8neI!I78
z;#GSFt46%45wB{*s~YjD0|!+jUbPRSYQ(D=@u~v}RU=;2h*ve@RR<NSZM^EhLUmxF
zYQ(D=@v26=YENU;h*uqGs2cIA?ZB!LuWH1r8u6-a!m1IkYQ(D=@v26=su8bh#H$+d
z>J`66ys8neYQ(D=@v26=su8bh#H$+dsz$u35wB{*s}5vTjd)cfUe$<K9n`2A@v26=
zsu8bh#H$+dsz$u35wB{*s~YjDM!c#KuU_$M#H$+dsz$u35wF^JQ#ImMjd)cfUbWY>
zYQ(E{nO5H^t46%45wB{*s~YjD-KbR~UbQQ=YQ(D=@v85g)&8HV5wAMPQZ?dLjd<0;
zmg-<j)reQG`8DEIjd)cfUUdMbYQ(D=@v26=su8bh#H$+dsz$u(;7rwsS2f~Q-)XBx
zys8neUh`k`+t*aD`LFqX->nYhR1JDngI>MncTlJL4qWZ(ts3;I2ED35uU_-pKB^9Y
zRSkO8fuE{Dui714HRx3hdev^>szI-6(5rS2R}FgA!Lq7BuiC#=eXp(>^r{BEszI-6
z(5ns{RSkMogI@KWyK2y@8uY3Ly=s40)u31HX0Eo&s(oYCL8j_^dG)=#YS611^s4XY
z)h_627j(7lRyF8V4SH3BUe%yi?Ut?@^s0SoRfAsDpjS2MRSkMogI?93S8w<==v4=>
zss_EPL9c4it2g`_^r{BEszI;Xy<IivRSkMogI?93S2gHW4SH3BUbXAHYS611^r{BE
zszI;XcBnV}H~csJwj-+Th<d}Xaj!aHR&V(=?p3?YtL=-baj$CJs~Y#}Ex*RSYFB#I
zxK}mqRofrc_D9vYR~>At8uzMgk!pWo)wow}lhj*&4SZDtU)8`@9ki?(_^JlJddq*y
zZ@Z;x<g4~ER{I#MM!u?%uiE~t8u{uizec`lTexcEs~Y*LM!u?%uWID08u_Y5zN(S0
zYUHaL`Km^~YM)Nk$X9JYSB-pCBVV<BR5kKdjeOPqo~n_rYUHaL`Km^~YTK!5<f|I_
z>K*?b{~f=EzG|DQYUryP`l^P$ddGjq?~p>Z->BN&stzqwjeS*PU%lhk*jMlPHTTs!
zehq&0j$e~sz2n#DR~@9R4o6hm_f?}`)#z6>`c;j7Rij_k=vN)4s2csMLlsq{U)AVW
z?Tf8<{2Kl09l!ln)nSaP;jiBDYx=8q{2KqN{aMxita`_<0kGcj-|^q^J3v}B16Iv|
zRWo4K3|Q^=st%e~&45)iVATv*?FXw4kyM9Bs_n{ZyRvEqteOF<X27Z$u-@})2CVj%
zRn36ap_Hl_uxbXZ_McVzFRE?Psu{4_k5O%t)@GYjGho#WSTzGy&45)iVATv*H3L@7
zfYqUxsu{3q2CSL^tHUzY!Q84Du-X?=H3L@7fK@YK)eKne8>#k<R1JYuLtxbqSTzJz
z2Y;)kz-m8Uz2|ozxIXYd@M{#T4)s)xg7tyle(tJSuxb{p5BwSi>jS^0!Rl~Oec*rK
z*Fab`5LOL@^@0C^|AAjKVRgu;Y9_3j39Dwp`oRCd|G@vi|G=-ouxc=@8Vu_Lzb3<~
z$*^iNteOm~Cc~=9uxc`__PbY&hV_B}f&YR3f&YPD^I?79*L+wVrm7kcs|LiX0kJw{
zRqeN|5B!=2>I1+1_x0hS<cA+ri)vME>YZv=9ja4xsczMydR3q5R|9HL4XI%@qDIx2
z8dno)QcbC8HKS(LoSIh)YEdnzWwoMK)tXvY8){Q+scp5RcGaHRR|o1)y;n!-Se>X-
zb*9eMg}PK%>RR2XTXm=I6@LqV3x5lL3%?(&NejRIxoP2V;cwx$e=Yj!qJ=Ll{Mz`^
z!mpJtE&ST~(!#H$FD?Aq`qIL$wJ$CF+WXSNuf;Dd{M!7|!f&5(TKKj5rG>wRU)x_=
z__h9}h2MU?wD3E=k{13J{uX}wnqyycv<1d~zi17N*1$L{8SQ~_940ONS_GrfAuas&
z4aTvV*gqZnr_;*s5N2BWTlri0Tlri0Tlri0?Zb{{!q|_UR{mE0R(|^!)5>o@V_NxJ
z`R#8^E5AmRwDPy|I}Q{Lh-u|-<!|NJjF?vbR{mE0R{mCgJ%-Utm{$H){#Jfng3%=y
z9f4`(Z{=_0Z{^olmsWoJ)}vW2I^v=uE}Gt==`C8QqM<7K+@jAddfKAtDtg+Yr!8&#
zI@!|3-^Q<#EjroK#_y1B9J`9%wP;$4Znd=WxAEJjnl^r&Y0-=p?OV}umNtH^WYHs*
zHhzbM<Jea;mqaU9^lL?zR_u?Bo~r1VibKZH4i$|%(GC@@Ptp1m%_!0E6pc;M@DyE4
z(Zv+qTWRChvJ{O<v0pbDm7-B8ZTxNgZTuRRqG2f-mZD)P8kVAADfR(J(^A^_g{dM;
z6=AA4HW6W}yyF+9iZE3i%8oEq-th}lMVKnWR1v0%Fja)9Vt;aksq&6rpeh1YdB^{b
z{~iB3egUi4=Nuuc2w6qQD)0COt>U;~-th}s<sJV!{&)OBS9!<(j$czwgs&oe72&H0
zUqzEs-toWV7sARr{&)QE_yw}^j{hD1JN|e4@A%*GzvFidCYpSr$tUgn?fmWh4%^2u
z%Cz$fbHyS2*iW8z{&s$c^P?ds4(X?zzn#CGzn#CGU!zdk`P=!0z#;?|A+WUbxAV92
zxAV92xASWYh;Ueh!y+6OO-9jV6ydOFG>VpiXf%rb^wDe-;jm~pisPpd4vTPDgu@~n
z7U8f6hebFn?fjaO($3$`uk9f1{O$bh{O$bh{F;{1!LM;C9sELN5h_aue+R#CSvvST
z_&fMJ_%%313qv~iwJ}6jdNe#mS9%1{qVXvjpQ7<89sG_jM7Mf$tH&|lbnpwQrGvkN
zUsx?2{2lySBGSR%!Qa8(!Qa6z%ofd4(L5D_wsi1oorqvt1lyv8B7$vk93z^m(!sC6
zDjobC{2lxq{2lxq{2lxq{GI%r{GI%r{GI%r{Emf0I4;6*(WI45{!V_!N21$4j$6lZ
zk~nr9$4b)4FFcn{{!ac*enGl8&KF_2bn<udYxIg>U9@{dxGoyL(#hY+?|5K3`8)YL
z`8)X?A515|V}$AC@8ox^FrECJ{GI#)deJ%(#}CuVuYoMuNYcsQ$=}J}$=}KE_+mQw
zJNY~LwVK57#%LCdhO;>Cm@fV<e$8jmh7u`)bn$obJ0_Vf{x1G5e#h-2T@dMlbn$ob
zcky@eYg~)tm+9i~;&&V~UHqEa(#7wXX1e&h_`CSK_$3t5#V@50&5O|w5RGrq6A(QC
z>EiF=*AkO1{x1G5{x1G5e#fk$Zy@>x(#7xiRl4{!&P5+Vy7;^JyZH4Kq?=#7A>I7l
z{9+F2=GR~s{RZjg7kNlGzt)>{^J~0IH-9(3euQ-Mck^q@NjHBte>Z<Oe>cB=g>>`l
zSx7g()}3_ock_4ick}CINH>2se>Z<Oe>Z<Oe>Z<Oe>Z<Ozt}|_3yx#K>E;)^h-1R(
z<`=(+_(dGEi^jk>P8`RH)6L(_?>Jt%`MdeM`SngjJR?2)J^W%C5z|Nye-FR7Ml=_u
zhu^X0^zira_we`d_wYOZoF4uj{vLk480q2f;qT$^;qT$spqL(h$EwrA-^1U--^1U-
zuX!;&{5||V{5||V{5||V{5|}7JR&v{v61NYNDsf`-s$0Y>^mAC)5G7x-^1U--^1U-
z-^1U-@A!Cn_<Q(!`Fr_$`Fr_$`Fr_$`880cmtT8Tdii_#d-;3$d-;3$d-;3$d-;3$
zwO^%|zn9<f()9B8^7r!h^7r!h@@vRUFTdDJ#9kuy60sLQN~!oAyH78FFMlt;I84N0
zA`TOAnDp}Z^7r!h^7r!h^7ry<@=Py(FMlt;hR<mDOfSE_nDp{%4@)n9AHU|{^zrxc
zJGL8*q0tx`?PKZV@8j>|@8j>|@8j>|@8j>|@8j>|@8j>|*HD^1eodw6<L~3wT$(=q
zK7Kt3>ErL?@8j>|@8j2}kUsuC{yzRb{yzRbe!V>D<L~3|<L~3|<L~3|<L~3|<JaVz
zK7RcU>EqYXnm+zMevPf^<JS_GK7LKE>F3wzntuL%ehshD@S1-9e*S)b&98C1JN^9q
z{Qdm>{Qdk|>!L9>dMl#0A{u0))h_+~{rvs>ny#a1HvRk>XVcH$&)?7A&)?7A&#x~d
z{rvs>{rvs>TK1yRHvRnl{Qdm>{Qdm>{2Fl6&)?6l5jXw(dY;nH-_PIA-_NfHDq8>2
z&#zH8{rsABGr&K<uW2{h0;5kV1N;O0`lT|!uV*R){Q9Ocz(2r0z(2r0z(2r0z(2r0
zz(2r0z(2r0z(2r0z(2r0z(2r0z(2sR4KV}!1N;O01N?fkGQdB;Kftd|F$4Sq`~&=Y
zS2Dmqz(2r0z^|7p1N;O01N<70Gr&K<Kfpi0Kfpi0ukR%T`~&=h{Db_1{2G>{9WvS>
zGsr*4uYoywWTHnVgZ%nrGRQy3Kgd7GKgd7GKgd7GufaZp{Db_1{F?1E$Un%h$0mdP
z`p%*uI)nU!{Db_OqoWNo8l*GGuNNnS{Db_1{F<gS$Un$G$gfW)gZz4RGRUuyI)nU!
z{Db_1{Db`ZG&0CP$Un$G$Un$G$UnqC#6QG8#6QG8#IGkVL;RYuqc<)?`~m|R;@6~|
zA^su$A^su$A^su$A$~m}8R8$}AL1Y4AL1Y4*J2vY-5KH^;veE4;@5kZA^su$A^su$
zA^su$A^su$A^su$A^su$A^su$A^su$A^su$A^su$A%0Ed(L|mh{vrM$etk_D;veE4
z<{#!C<{#!C<{#!C=GP0DVSYVO8Rj46ALbwCALbwCALbwC*D{-7{$c)M{$c)M{$c)M
z{$c)Me!Ybm<{#!C<{#!C<{#!C<{#!C<{#!C=GXKdy@(m+7uLuy|1keB|1keBzdpqb
z^AGb6^AGb6^K0MDF#j<BF#j<BF#j;Ww%!c$kMNK1kMNK1kMQex%n1Jo{|LYS$Bgig
z@N3k~2>%HG2>%HG2>%HG2>%HG2>%HG2*19`jPPqA&Itbq{|LWE{*3UC@Q?6o>dy%O
z2>%HG2>%HG2>%HG2>%HG2>%HG2>%HG2>%HG2>%HG2>%HG2>%HG2)|%XM)-wuGQuxB
zkP-e7{!#u>{!#u>el5-!<saoA<saoA<=6X}QT|c>QT|c>QT|c>QT|c>QT|c>QT|c>
zQT|c>QT|c>QGWfS8RZ}4ALSqAALSqAALSqAALZ9$no<5y{!#u>{!#u>{!xDYcp2p%
z<=2;&QGUI78RZ}47mmm%|0w?`|0w?`zuvu!@{jV5@{jS4@r&1HjDL)Oj9-6i#`wqh
z$N0zi$N0zi$N0zi$N0zi$N0zi$N0zig`c87Fk}3BWHZJ;#y`eC#y`eC#;<oaWBg<M
zWBg<MdTBGpKgK`Cudg;^{A2urN*UuH;~(Q6<JWVWF@Alw(RUlI=^5i6<JW_mG5#_B
zF@F8H8RH-0*O!|y{&D_s{&D_s{&D_s{&D_s{&D_s{&D_s{&D_s{&D_s{&D_s{&D_s
z{&9XmlxU65IR7}m{@;xAkMoc73tMHJf1H1uU+^m9{Nw!N{Cb5m&OgpS&OgpS&Ogqt
z-9F>|<NV|N<NV|N<NV|N<NV|N<NW&aGR{BFKh8hSKh8hSKh8hSuWv6C{1f~W{K7Mt
z;Gf{v(w_<b3H}Lw{e79>pWvV1pWvV1pWxTioC*F3{t5mGesO?I@QVawf`5X4f?p^n
z6Z{kW6Z{kW6Z{kWq63-WpWvV1pWvV1pWvV1pWxSHm<j#~eldeg@K5jy`DB8Bf?wDt
z`Vce0FNTl_{t5mG{t5mG{t5m`{z-nJpiJ^l@{25Fl7Etal7Etal7Etal7Etal7Eta
zl7Etal7Etal7Etal7Etal7Etal7Etal7Etal7Etal7Et4KX}9=GRZ&5KgmDIKgmDI
zKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgmDIKgB=AKgB=A
zKgB=Aub($l{8Ri>{8Ri>{8Ri>{8Ri>{Ce0k#jp1_@*J7spW>h5pW>h5pW>h5pW@dS
z8a?ls;@9^c`HxKTPw`LjPw`LjPw`LjPw`LjPw`LjPw`Lj>n+U`{}lfe{}lfe{}jLA
zVy5_~_^0@%_^0@%_^0@%_^0^wDQB90nqU8YrunD&r}?M(r}?M(1yeH3ua`B`{Q6ll
z%|Fer$2me|ndYD7pXQ(DpXQ(DpXQ(DpXQ(D*AJa({%QVc{%QVc{%QVc{%QVc{%L-F
z(-BUK-sw#9>z~dv|1|$J|1|$J|1|$J|1`e<O{V$f2r|vD=Qh*))BMx?`fnp&kZJyD
z{u%xm{u%xm{u%xmem%OG;n%O78U7jm8U7jm8U7jm8U7i5VV%tI&+yOi&+yOi&+yOi
z&+yOi&+rTNWQKo+e};dCe};dCe};dCe};dCe};dCU%z-}_-FWM_-FX#BQnE3!#~46
z!#~3>Fq9d7y~WX69KFTSTbvpG8U7jm8Ggam$X8^Rf0loif0loif0ke9D6{;t{ImSC
z{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImSC{ImS}o}=$M
zv;4FCv;6WN(FdJb{#pK6{#pK6emRiL@(V^~mVcIimR~j`v;2CeGs`b9m0A8-{#pK6
z{#pK6{#kxus>qULj(?7Sj(?7Sj(?6{&?<sfnd6`1pW~n7*SnuN{yF|R{yF|R{yF|R
zegVn|XJw9mj(?6{P%Cr%bNsR^kzL6gzwlP(`1N^bj(?7Sj$f~G<O4FtKgU1EKgU1E
zKgU1EFPxb<e*NK@<DcW7<DcW7<DcW7<DcW7<DcW7<DcW7<DciB=bz`F=bz`F=bz`F
z=ht(ddH#9+dH#9+d455&%=6Fl>q*Z%zu;Ns`RDodsAryko`0Tyo`0Tyo`0TS|9a;6
z=lSRP^|EK4f1ZDyf1ZDyf1Y2Dd*=D&Br?xG&p*$vk2~}H^ZfJt^ZfJt^ZfJt^ZfJt
zvK5)<7l!k{J5l`e{PX+^`~vD(;9uZh;1`&SEJhai7x)+W7x?9ivcSK<zrer1zrer1
zzrer1zrer1zrZhamj(U>{ssO8{ssO8{ssO8{ssO8{ssO8{ssO8{sn$nrY!I;@GtN$
z@GtN$@GtN$@GtN$@GtN$@GtN$@GtN$@GtN$@XM8Cfq#L2fq#L2fq#*Ik$;hYk$;hY
zk$;hYk$;hYk$;h2&LE5Yi~PdCS>#{jU*uopU*uopU*uopU*uopU*uopmr=+f|04e)
z|02K4LgZZ{ua!mqMgB$pMgB$pMgB$pMgB$pMgB$pMgB$pMgB$pMSi)52q<Tff02KY
zf02KYf02KYf02KYf02KYe~DjCB1`;B{7d{Y6ItS4;$Pxl;$PyIqsS7!0Cbl4m-v_X
zm-v_Xm-v_Xm-v_X1uY|YktO~m{w4k;{w4k;e)*m(@h|Z&@h|Z&@h|Z&@h|Z&@yl&w
ziGPWIiGPWIiQkVZWQkuGJHpsm;$Pxl;$Pxl;$Pxl;$PyI{m2sk68{qa62E*=mid?Y
zm-(0Zm-(0Zm-(0Zm-(0Zm-%Ipvdk}6mu3EC{$+mI(Jb>X^Dpx+^Dpx+^DpxYiD#LA
znSYsI1|`e<a!pz0U*?x@$};~l|1$qF|1$qF|1$qF|1$qF|1$qF|1$qF|1$qF|1$qF
z|1$qF|1$qF|1$qF|1$qF|1$qF{|dj%YgYJI_*eMlU$er$!oR}5!oR}5!Y@ad6@K~H
ztnjb!ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukf$%ukg$1
zWQBi)e}#XAe}#XAe}#XAe}#XAU+_LF{44w`{44w`{44w`{44w`{44w`{44yc{Hy$O
zWm)B4<zMAr<zMAr<zMBOo6IWzD*r0~D*r0~D*r0~D*r0~D*r0~D!+_cR{2-?SNY|Q
zvdX{8zsfIrlvREiw#Xo5m4B6gm4B6gm4B6gm4B6gm4B6gm4B6AW+{GHAgla-Tp+9b
ztNg3{tNea!AhK~;<(H3(%x7ervdX{8zskSHzsB!J39`n&#=pkz2MV&rzsA4DzsA4D
zzsA4DzsA4DFLReQ{x$wJetGHmQG=}Uuko+(ukrhlgRJrUp@XdPuko+(uko+(`|*RU
z@vrf(@vrgwF@(rtMII}%s#)V-<6q-n<6q-n<6q-n<6q-n<6q-n<6q-n<6q;KbIlsR
ztZUZz*Z9}?*ZE}zBjcTQ{&oI!{&oI!{&oI!{&oI!{&oI!{&oI!ep%W0b%w0-uk-tr
zhOG0;&t{!}oqwHwoqwHwonNjt>-_8d>-_8de%T@G{IZLYx6L~LI{!NVI{!NVI{!NV
zI{!NVI{!NVI{!MqUxvs!|2qFV|2qFV|2qFV|2qFV|2n^XWMq1?!7taF4gL-O4Sqkb
z7TL*c@Ne*M@Ne+TQf7mHgI@+XGQip3-{9Zi-{9Zi-{6;@%Le}jzZ_jQ_&4}B_&4}B
z_&4}B_&4}B_+{*~!N0-3!7qE44gL-O4gL*&S-foU%N}Qge}jL6e}jL6e}jL6e}jL6
ze}jL6e}jL6-!C#}gI{Jj8~mI6oBZ;9+2r5k-{jxq-{jxq-{hBX&L;mR|0e$?|0e$?
z|0chDX*T&c`8WAD`8WAD`8WAD`8WAD`8WAD`8WAD`8WAD`Q=ly$-l|J$-l|J$-l|J
z$-l|J$-l|J$-l|J$uF0fP5w>(P5w>(P5w>(P5w>(P5w>(P5w=OImT@A%fn`i-|re^
zi+_uMi+_uMi+_uMi+_uMi+_uMi+_uMi+_uMi+_vX?;>Q2e~W*Me~W*Me~W*Me~Vu>
zJX`!*{9F86{9F86{9F86{9F86{C?RaTl`!6Tl`!6Tl`!6el#gt{9F86{9F86{9F86
z{9F86{9F86{9F86{C;#hTl`!6Tl{kB+2-Ho-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u
z-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{#-u-{$vgF4^Ya
z=HKR*In6fzHvcyNHvcyNHvcyNHvcxitbMlmxB0jExB0jE<?pl2zs<kRzs<kRzr(-7
zzr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7
zzr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7zr(-7
zzr(-7zr(-7zr(-7zr(-7zr(-7zstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYN
zzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYNzstYN
zzstYNzstYNzstYNzstYNzstYNzstYNzstYNFV7yovyff>UH)DEU49w&?DFsN%erTe
ze~;g9nPiWDkAIKfuL)(3-;aK0kKYe}XOG{He`k+>kAII}20DBEd;EL+d;EL+d;EUT
zEPMQW{CoU+{CoU+{C@u-em5n1{CoU+{Ib;9<KN@=OGeq_-{aro-{aro-{Y6J&L00B
zzua~FAX)bK_xShtWt_9eFUOxfem_(m+3fhS^7wV6?D6mO@AL2T@AL2T@AL2T@AL2T
z@AL2T``!Kc-TmzI@AL2T@AL2T%Ti~b-|xR<pMRf!pWp8XWS?IiJp26n{QLa-{QLa-
z{QLZVGba1|`~3U-ez7V0{QLa-{QLa-{QLa-{QLa!<=N-o=ilew=ilew=ile|+cVkc
z_xm&1=ilew=ilew=ile|+Xgw{Kj1&$Kj8N(Q90m0;6LC$;P?9nIp9CwKj1&$_q!cA
z;6LC$;6LE^|0Ku({{g>Wn#uwH0sjI20sjHNAGXT@zu&~k0sjI20sjI20sjI20sjI2
z0sjI20sjI20sjHN-_pqe{{jC2{{g?>*2w|?0sjI20sjI20l(keiQnAG0sjI20sjHN
z-yO*z{~^C$#mXW7A-~@u$szwC{~`Y&zu)W0A^#!&A^#!&A^#!&A^#!2-}1>J{~`Y&
zzhBwPA^#!2-}uQP{~`Y&{~`Y&{~`Y&{~`Y&{~`Y&{~`Y&{~`Y&{~`Y&{~^C$?aCql
zA^#!&A-`Yo$|3(D{~`Y&zhCvrA^#!&A^#!&A^#!&A;0{8<o_f8ANl{t|405m^8b<l
zkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=yfBy?z#V`LK`TxlONB%$Z|B?TX{D0*C
zBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<l
zkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(F
zKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y
z|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+
z|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P
z|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG
z{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&
z`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C
z<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~n
zk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlO
zNB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8
zANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhU
zf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z
z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t
z|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro
z{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX
z{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m
z^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P
z$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*C
zBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<l
zkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(F
zKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y
z|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+
z|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P
z|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG
z{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&
z`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C
z<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~n
zk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlO
zNB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8
zANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhU
zf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z
z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t
z|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro
z{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX
z{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m
z^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P
z$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*C
zBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<l
zkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(F
zKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y
z|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P|Bw8C<o_f8ANl{t|405m^8b<lkNkh+
z|0Dk&`TxlONB%$Z|B?TX{D0*CBmW=y|H%JG{y*~nk^hhUf8_ro{~!7P$p1(FKl1;P
z|Bw8C<o_f8ANl{t|407+|E=O~cH>wOG7NL=Gh!CqRSQ4}79ett6XzVeo1b9?LR=92
zeo|cmPX>mcg)8pJ>iCNiyIoc9^V0vn^#3pY|4aY>(*M8o|1bUjOaK4U|G)JAFa7^Z
z|Nqkezx4kv{r^k<|I+`z^#3pY|4aY>(*M8o|1bUjOaK4U|G)JAFa7^Z|Nqkezx4kv
z{r^k<|I+`z^#3pY|4aY>(*M8o|1bUjOaK4U|G)JAFa7^Z|Nqkezx4m<|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7n
zfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*{l<R%fBOIQ|LOnJ|MwgF
z_5bPr)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|NYi}{eSxZ^#AGq)BpEd`}P0n|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*
z>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq
z|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq
z)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ
z|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJ
zr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c
z|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUc
zPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?
z|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?
zpZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7n
zfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|K04r*?+VDX8+CpoBcQYZ}#8pzuAAY|7QQq{+s<b`)~H&?7!K6v;SuQ&HkJHH~Vk)
z-|WBHf3yE)|IPlJ{Wtq>_Ur%C|EK>?|DXOp{eSxZ^#9%LzuAAY|7O4bKmC6<`)~H&
z?7!K6v;SuQ&HkJHH~Vk)-|WBHf3yE)|IPlJ{Wtq>_Ur%C|EK>?|DXOp{eSxZ^#9%M
zzuSMe|8BqjKmC7q`}P0b?Z4Z9xBqVc-Tu4%cl+=5-|fHKf4Bc`|K0w({dfEC_TTNl
z+kdzJZvWl>yZv|j@AlvAzuSMe|8D=?{=5Bm`|tMO?Z4Z9xBqVc-Tu4%cl+=5-|fHK
zf4Bc`|K0w({dfEC_8;~i_8;~i_8;~i_8;~i_8<1^|I`1c|L?H>u>Y|CuwVb5{y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|LOnJ|EK@&wEwjKwEwjKwEwjKwEwjKwEwhU|DXQ9)Be-`)Be+b{eSxZ^#AGq)BmUc
zPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?
z|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?
zpZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7n
zfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*
z>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq
z|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq
z)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ
z|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJ
zr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c
z|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUc
zPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?
z|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?
zpZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7n
zfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*
z>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq
z|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq
z)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ
z|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJ
zr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c
z|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUc
zPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?
z|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?
zpZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7n
zfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH
z`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D
z|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ
z^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ
z|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*
z>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq
z|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq
z)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ
z|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJ
zr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c
z|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUc
zPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?
z|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?
zpZ-7nfBOIQ|LOnJ|EK>?|DXOp{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v)
z{y+VH`v3I*>HpLJr~gm?pZ>p}?f=<+{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c
z|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK@&&Hgw0_5bPr)BmUcPye6(
zKmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJr~gm?pZ-7nfBOIQ|LOnJ|EK>?|DXOp
z{eSxZ^#AGq)BmUcPye6(KmC9D|MdUq|I`1c|4;v){y+VH`v3I*>HpLJd;7z;B@93q
zfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuw
zFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp229
z0AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPU
zVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I
z0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy
z!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a
z0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1Da
zgaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-
z0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;
z2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu
z0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx
z5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S
z1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rX
zAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv
z3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&
zKp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST
z7=SPUec1nD|A+k__G19T0Q6!1hy5S+f7t(F|A+k__J7#_VgHByANGIP|6%`!{U7#!
z*#BYwhy56UFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx
z5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S
z1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rX
zAPhjC_J7*{Y5%AF7=SPUVF1Da^lAU6{h#)K+W%?)r~RMyf7<_P|EK+*_J7*{Y5%AF
zpZ0&+|FHjI|HJ-={SW&e_CM@@*#EHqVgJMahy4%xAND`&f7t)9|6%{b{)hb!`yci{
z?0?w*u>WEI!~Tc;5BneXKkR?l|FHjI|HJ-={SW&e_CM@@*#EHqVLt{S3_uV2AND`&
zf7p)!2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1Da^tAtJ|I_}b{ZIR!_CM`^
z+W)lwX+H)a3_wr&pY}iPf7*`$2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1Da
z^tAtJ|I_}b{ZIR!_CM`^+W)lwY5&vyr~Oa+pY}iPf7<`F|7riz{-^y<`=9nd?SI<;
zwEt=U)BdOZPy3(tKka|oj{yh+5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;
z2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu
z0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx
z5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S
z1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rX
zAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&Kp2290AT>a0E7Vu0}uuv
z3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0SE&S1|SST7=SPUVF1DagaHTx5C$L&
zKp2290AT>a0E7Vu0}uuv3_uuwFaTiy!T^K;2m=rXAPhhlfG_}I0Kx!-0r2)8YU^kK
z(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G
z0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLaw
zq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2K)<)21`rJ(
z8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2
zKs1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4Immo
zG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4
zfM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0rcbFCk-GPKs118
z0MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT
z(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G
z0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLaw
zq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V
z0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?W
zL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz
z1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$
zhz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c
z1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh
z5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?
z4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1
zAR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(
z8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2
zKs1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4Immo
zG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4
zfM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCF
zXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks118
z0MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT
z(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G
z0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLaw
zq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V
z0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?W
zL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz
z1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$
zhz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c
z1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh
z5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?
z4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1
zAR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(
z8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh5Dg$2
zKs1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC?4Immo
zG=OLT(Ey?WL<5Kh5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4
zfM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<1NNU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6>*v7ZJo8o+1(
zqXCQtFdD#U0HXnn1~3}HXaJ)Dj0P|oz-R!Y0gMJP8o+1(qXCQtFdD#U0HXnn1~3}H
zXaJ)Dj0P|oz-R#bt^G8B(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^n||0HXnn1~3}HXaJ)Dj0P|oz-R!Y
z0gMJP8o+1(qXCQtFdD#U0HXnn1~3}HXaJ)Dj0P|oz-R!Y0gMJP8o+1(qXCQtu)F;<
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfF1T9_8;~i_R|1H0~ifpG=LrUANC*iANC*iANC*iANC*iANC*iANC*iANC*i
zANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*iANC*i
zANC*iANC*iANC*iANHU2pZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ1^j
zpZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ1^jpZ3!LMgtfPU^IZy07e5C4PZ2Y(EvsR*lGW1
zKMi0sfYAU(1K4T*Y5!^eY5!^eY5!^eY5!^eY5!^eY5!^eY5!^eX+I5MG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(1K6M2PXibY
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=Tku{WO5l07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(
z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy
z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F
zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP
zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR
z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n
zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y
z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp
zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e72{>+6!hrju8_;hfc$Kf9w
zoPBXP9K4Qq__rU2e|PY$-NEnu?VtXYw{Jh*`Ro7pjt8FRzkk=Sp6-7i{Nw+9@K68u
z!S6lz-M{$IA5ZyT?|AUznf~X&k7xSbzy9$<{_xHpKje=Oe8?A9x2`W<`Z{>7FJ4Gq
zUwlWpzKHLyFUsKSi@xppA_Td9@*}|YldpT%PyW+iKM4`9pVTzhPwv0HesY7L!~36H
zKf7Gx@OKVQfL%ZP%5(iJW_P$b=yk521p?QbOZV5C^Ptz86HeEgvosDqmpA7-9RAY5
zPn6f2FTdBD0{eQ?d0lV9q3iAU{+8d|t9!k<$?1A?*UI(g){pC}bJf>ZCn2w|&dOY0
zy-c{i`hRzQ#Q|Jj-GqOAbxZj5)x8}K{_t<EtY6<;rgZpc2Nwk$e!S=O`t{8@f$N)Z
z`qww#DXwq61Yh4ItJgOz*!Au2{Tbifs(yWQzxDOa%_-NrD=OE!%LCWD^YqudldRXf
zvwGLNAGNM`KTBNiUOZm!ObG{{+WW)vz5n>s-rb>ny}O<EdUrq3_3j3e>$@ws*LSCR
zuJ69@Uf<>B*LN}Q_1!J#*LU}JUEkesbp7I$o`b)zUwjR^e$iK4zqr%(`qgE_>sQxw
zu3ug3xPEoU<oeZ*!Pl=|i(bDz{5$(_)%f~wdG-2m{p9*^;{5t>=Ir|LGVDJA<n?Pm

literal 0
HcmV?d00001

diff --git a/setup.py b/setup.py
index fe7990447d..28e85a0068 100644
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,8 @@ setup(
                       'boto3',
                       'requests',
                       'tqdm',
-                      'regex'],
+                      'regex',
+                      'sentencepiece'],
     entry_points={
       'console_scripts': [
         "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index 30a6bfbec7..ec9cc2cda1 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -35,8 +35,8 @@ class XLNetModelTest(unittest.TestCase):
                      parent,
                      batch_size=13,
                      seq_length=7,
-                     mem_len=30,
-                     clamp_len=15,
+                     mem_len=10,
+                     clamp_len=-1,
                      reuse_len=15,
                      is_training=True,
                      use_labels=True,
@@ -78,6 +78,27 @@ class XLNetModelTest(unittest.TestCase):
             input_ids_2 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
             segment_ids = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.type_vocab_size)
 
+            # inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
+            # seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
+            # input_mask: float32 Tensor in shape [len, bsz], the input mask.
+            #     0 for real tokens and 1 for padding.
+            # mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            #     from previous batches. The length of the list equals n_layer.
+            #     If None, no memory is used.
+            # perm_mask: float32 Tensor in shape [len, len, bsz].
+            #     If perm_mask[i, j, k] = 0, i attend to j in batch k;
+            #     if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+            #     If None, each position attends to all the others.
+            # target_mapping: float32 Tensor in shape [num_predict, len, bsz].
+            #     If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+            #     on the j-th token.
+            #     Only used during pretraining for partial prediction.
+            #     Set to None during finetuning.
+            # inp_q: float32 Tensor in shape [len, bsz].
+            #     1 for tokens with losses and 0 for tokens without losses.
+            #     Only used during pretraining for two-stream attention.
+            #     Set to None during finetuning.
+
             lm_labels = None
             if self.use_labels:
                 lm_labels = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
@@ -106,44 +127,15 @@ class XLNetModelTest(unittest.TestCase):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
-        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, segment_ids, lm_labels):
-            model = XLNetLMHeadModel(config)
-            model.eval()
-
-            hidden_states_1, mems_1 = model(input_ids_1, seg_id=segment_ids)
-            hidden_states_2, mems_2 = model(input_ids_2, seg_id=segment_ids, mems=mems_1)
-            outputs = {
-                "hidden_states_1": hidden_states_1,
-                "mems_1": mems_1,
-                "hidden_states_2": hidden_states_2,
-                "mems_2": mems_2,
-            }
-            return outputs
-
-        def check_transfo_xl_model_output(self, result):
-            self.parent.assertListEqual(
-                list(result["hidden_states_1"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
-            self.parent.assertListEqual(
-                list(result["hidden_states_2"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-
-
         def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, segment_ids, lm_labels):
             model = XLNetLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, target=lm_labels)
-            lm_logits_1, mems_1b = model(input_ids_1)
+            loss_1, mems_1a = model(input_ids_1, seg_id=segment_ids, target=lm_labels)
+            lm_logits_1, mems_1b = model(input_ids_1, seg_id=segment_ids)
 
-            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
-            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
+            loss_2, mems_2a = model(input_ids_2, seg_id=segment_ids, target=lm_labels, mems=mems_1a)
+            lm_logits_2, mems_2b = model(input_ids_2, seg_id=segment_ids, mems=mems_1b)
 
             outputs = {
                 "loss_1": loss_1,
@@ -160,23 +152,23 @@ class XLNetModelTest(unittest.TestCase):
         def check_transfo_xl_lm_head_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss_1"].size()),
-                [self.seq_length, self.batch_size])
+                [])
             self.parent.assertListEqual(
                 list(result["lm_logits_1"].size()),
                 [self.seq_length, self.batch_size, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1a"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1b"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
             self.parent.assertListEqual(
                 list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
                 list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
-                [self.seq_length, self.batch_size])
+                [])
             self.parent.assertListEqual(
                 list(result["lm_logits_2"].size()),
                 [self.seq_length, self.batch_size, self.vocab_size])
@@ -218,10 +210,6 @@ class XLNetModelTest(unittest.TestCase):
     def run_tester(self, tester):
         config_and_inputs = tester.prepare_config_and_inputs()
 
-        tester.set_seed()
-        output_result = tester.create_transfo_xl_model(*config_and_inputs)
-        tester.check_transfo_xl_model_output(output_result)
-
         tester.set_seed()
         output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
         tester.check_transfo_xl_lm_head_output(output_result)
@@ -242,6 +230,22 @@ class XLNetModelTest(unittest.TestCase):
 
         return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
 
+    @classmethod
+    def mask_tensor(cls, shape, vocab_size, rng=None, name=None):
+        """Creates a tensor with padding on the right (0.0 for )."""
+        if rng is None:
+            rng = random.Random()
+
+        total_dims = 1
+        for dim in shape:
+            total_dims *= dim
+
+        values = []
+        for _ in range(total_dims):
+            values.append(rng.randint(0, vocab_size - 1))
+
+        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index fe120a522c..249f71f984 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -49,7 +49,7 @@ class TokenizationTest(unittest.TestCase):
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
         vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer.from_pretrained(vocab_file)
+        tokenizer = tokenizer.from_pretrained(vocab_file)
         os.remove(vocab_file)
 
         tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py
index bf0ac5db2f..226db4598e 100644
--- a/tests/tokenization_transfo_xl_test.py
+++ b/tests/tokenization_transfo_xl_test.py
@@ -44,7 +44,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
         vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer.from_pretrained(vocab_file)
+        tokenizer = tokenizer.from_pretrained(vocab_file)
         os.remove(vocab_file)
 
         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
diff --git a/tests/tokenization_xlnet_test.py b/tests/tokenization_xlnet_test.py
new file mode 100644
index 0000000000..e383dd7877
--- /dev/null
+++ b/tests/tokenization_xlnet_test.py
@@ -0,0 +1,88 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+from io import open
+import shutil
+import pytest
+
+from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP)
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(
+                    os.path.dirname(os.path.abspath(__file__))),
+                    'samples/test_sentencepiece.model')
+
+class XLNetTokenizationTest(unittest.TestCase):
+
+    def test_full_tokenizer(self):
+        tokenizer = XLNetTokenizer(SAMPLE_VOCAB)
+
+        tokens = tokenizer.tokenize('This is a test')
+        self.assertListEqual(tokens, ['▁This', '▁is', '▁a', '▁t', 'est'])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        vocab_path = "/tmp/"
+        vocab_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path)
+        tokenizer = tokenizer.from_pretrained(vocab_path,
+                                              keep_accents=True)
+        os.remove(vocab_file)
+        os.remove(special_tokens_file)
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
+                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
+                                      '▁is', '▁f', 'al', 's', 'é', '.'])
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(
+            ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                            602, 347, 347, 347, 3, 12, 66,
+                            46, 72, 80, 6, 0, 4])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(back_tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in',
+                                           '▁', '<unk>', '2', '0', '0', '0', ',',
+                                           '▁and', '▁this', '▁is', '▁f', 'al', 's',
+                                           '<unk>', '.'])
+
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
+    def test_tokenizer_lower(self):
+        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, ['▁', 'i', '▁was', '▁b', 'or', 'n', '▁in', '▁',
+                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
+                                      '▁is', '▁f', 'al', 'se', '.'])
+        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["▁he", "ll", "o"])
+
+    def test_tokenizer_no_lower(self):
+        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
+                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
+                                      '▁is', '▁f', 'al', 'se', '.'])
+
+
+if __name__ == '__main__':
+    unittest.main()

From 24d806898223d596773f14d02ff15a9197dbc9a3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 12:33:44 +0200
Subject: [PATCH 006/139] weights loading script ok

---
 .../convert_xlnet_checkpoint_to_pytorch.py    |  26 +-
 pytorch_pretrained_bert/modeling_xlnet.py     | 251 ++++++++++++------
 tests/modeling_xlnet_test.py                  |   6 +-
 3 files changed, 193 insertions(+), 90 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index eb89745be2..63f296ad83 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -18,23 +18,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import argparse
 import torch
 
-from pytorch_pretrained_bert.modeling_xlnet import XLNetConfig, XLNetRunConfig, XLNetLMHeadModel, load_tf_weights_in_xlnet
+from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+                                                    XLNetConfig, XLNetRunConfig,
+                                                    XLNetLMHeadModel, load_tf_weights_in_xlnet)
 
-def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path):
     # Initialise PyTorch model
     config = XLNetConfig.from_json_file(bert_config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
     model = XLNetLMHeadModel(config)
 
     # Load weights from tf checkpoint
-    load_tf_weights_in_xlnet(model, tf_checkpoint_path)
+    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
 
     # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
+    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
+    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
+    print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
 
 
 if __name__ == "__main__":
@@ -50,13 +58,13 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "The config json file corresponding to the pre-trained XLNet model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
+                               "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path to the output PyTorch model.")
+                        help = "Path to the folder to store the PyTorch model or dataset/vocab.")
     args = parser.parse_args()
     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                      args.xlnet_config_file,
-                                     args.pytorch_dump_path)
+                                     args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 08b193acfd..a165db1768 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -45,70 +45,122 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
 XLNET_CONFIG_NAME = 'xlnet_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
-def load_tf_weights_in_xlnet(model, tf_checkpoint_path):
+
+def build_tf_xlnet_to_pytorch_map(model, config):
+    """ A map of modules from TF to PyTorch.
+        I use a map to keep the PyTorch model as
+        identical to the original PyTorch model as possible.
+    """
+
+    tf_to_pt_map = {}
+
+    if hasattr(model, 'transformer'):
+        # We are loading pre-trained weights in a XLNetLMHeadModel => we will load also the output bias
+        tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
+        # Now load the rest of the transformer
+        model = model.transformer
+
+    # Embeddings and output
+    tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight,
+                    'model/transformer/mask_emb/mask_emb': model.mask_emb})
+
+    # Transformer blocks
+    for i, b in enumerate(model.layer):
+        layer_str = "model/transformer/layer_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
+            layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
+            layer_str + "rel_attn/o/kernel": b.rel_attn.o,
+            layer_str + "rel_attn/q/kernel": b.rel_attn.q,
+            layer_str + "rel_attn/k/kernel": b.rel_attn.k,
+            layer_str + "rel_attn/r/kernel": b.rel_attn.r,
+            layer_str + "rel_attn/v/kernel": b.rel_attn.v,
+            layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
+            layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
+            layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
+            layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
+            layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
+            layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
+        })
+
+    # Relative positioning biases
+    if config.untie_r:
+        r_r_list = []
+        r_w_list = []
+        r_s_list = []
+        seg_embed_list = []
+        for b in model.layer:
+            r_r_list.append(b.rel_attn.r_r_bias)
+            r_w_list.append(b.rel_attn.r_w_bias)
+            r_s_list.append(b.rel_attn.r_s_bias)
+            seg_embed_list.append(b.rel_attn.seg_embed)
+    else:
+        r_r_list = [model.r_r_bias]
+        r_w_list = [model.r_w_bias]
+        r_s_list = [model.r_s_bias]
+        seg_embed_list = [model.seg_embed]
+    tf_to_pt_map.update({
+        'model/transformer/r_r_bias': r_r_list,
+        'model/transformer/r_w_bias': r_w_list,
+        'model/transformer/r_s_bias': r_s_list,
+        'model/transformer/seg_embed': seg_embed_list})
+    return tf_to_pt_map
+
+def load_tf_weights_in_xlnet(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
     try:
-        import re
         import numpy as np
         import tensorflow as tf
     except ImportError:
         print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config)
+
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
+    tf_weights = {}
     for name, shape in init_vars:
         print("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
+        tf_weights[name] = array
 
-    for name, array in zip(names, arrays):
-        name = name.split('/')
+    for name, pointer in tf_to_pt_map.items():
+        print("Importing {}".format(name))
+        assert name in tf_weights
+        array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'squad':
-                pointer = getattr(pointer, 'classifier')
-            else:
-                try:
-                    pointer = getattr(pointer, l[0])
-                except AttributeError:
-                    print("Skipping {}".format("/".join(name)))
-                    continue
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if 'kernel' in name and 'ff' in name:
+            print("Transposing")
             array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
+        if isinstance(pointer, list):
+            # Here we will split the TF weigths
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print("Initialize PyTorch weight {}".format(name))
+            pointer.data = torch.from_numpy(array)
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + '/Adam', None)
+        tf_weights.pop(name + '/Adam_1', None)
+
+    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
 
 
@@ -181,7 +233,18 @@ class XLNetConfig(XLNetBaseConfig):
 
                  max_position_embeddings=512,
                  initializer_range=0.02,
-                 layer_norm_eps=1e-12):
+                 layer_norm_eps=1e-12,
+
+                 dropout=0.1,
+                 dropatt=0.1,
+                 init="normal",
+                 init_range=0.1,
+                 init_std=0.02,
+                 mem_len=None,
+                 reuse_len=None,
+                 bi_data=False,
+                 clamp_len=-1,
+                 same_length=False):
         """Constructs XLNetConfig.
 
         Args:
@@ -207,6 +270,22 @@ class XLNetConfig(XLNetBaseConfig):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
+
+            dropout: float, dropout rate.
+            dropatt: float, dropout rate on attention probabilities.
+            init: str, the initialization scheme, either "normal" or "uniform".
+            init_range: float, initialize the parameters with a uniform distribution
+                in [-init_range, init_range]. Only effective when init="uniform".
+            init_std: float, initialize the parameters with a normal distribution
+                with mean 0 and stddev init_std. Only effective when init="normal".
+            mem_len: int, the number of tokens to cache.
+            reuse_len: int, the number of tokens in the currect batch to be cached
+                and reused in the future.
+            bi_data: bool, whether to use bidirectional input pipeline.
+                Usually set to True during pretraining and False during finetuning.
+            clamp_len: int, clamp all relative distances larger than clamp_len.
+                -1 means no clamping.
+            same_length: bool, whether to use the same attention length for each token.
         """
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -215,7 +294,7 @@ class XLNetConfig(XLNetBaseConfig):
             for key, value in json_config.items():
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+            self.n_token = vocab_size_or_config_json_file
             self.d_model = d_model
             self.n_layer = n_layer
             self.n_head = n_head
@@ -225,9 +304,21 @@ class XLNetConfig(XLNetBaseConfig):
             self.d_inner = d_inner
             self.untie_r = untie_r
             self.attn_type = attn_type
+
             self.max_position_embeddings = max_position_embeddings
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
+
+            self.init = init
+            self.init_range = init_range
+            self.init_std = init_std
+            self.dropout = dropout
+            self.dropatt = dropatt
+            self.mem_len = mem_len
+            self.reuse_len = reuse_len
+            self.bi_data = bi_data
+            self.clamp_len = clamp_len
+            self.same_length = same_length
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -327,7 +418,7 @@ class XLNetRelativeAttention(nn.Module):
         self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
         self.seg_embed = nn.Parameter(torch.Tensor(2, self.n_head, self.d_head))
 
-        self.LayerNorm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
+        self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.dropout)
 
     def prune_heads(self, heads):
@@ -385,7 +476,7 @@ class XLNetRelativeAttention(nn.Module):
         attn_out = self.dropout(attn_out)
         if residual:
             attn_out = attn_out + h
-        output = self.LayerNorm(attn_out)
+        output = self.layer_norm(attn_out)
 
         return output
 
@@ -483,7 +574,7 @@ class XLNetRelativeAttention(nn.Module):
 class XLNetFeedForward(nn.Module):
     def __init__(self, config):
         super(XLNetFeedForward, self).__init__()
-        self.LayerNorm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
+        self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
         self.layer_1 = nn.Linear(config.d_model, config.d_inner)
         self.layer_2 = nn.Linear(config.d_inner, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
@@ -499,7 +590,7 @@ class XLNetFeedForward(nn.Module):
         output = self.dropout(output)
         output = self.layer_2(output)
         output = self.dropout(output)
-        output = self.LayerNorm(output + inp)
+        output = self.layer_norm(output + inp)
         return output
 
 class XLNetLayer(nn.Module):
@@ -691,11 +782,26 @@ class XLNetModel(XLNetPreTrainedModel):
         self.bi_data = config.bi_data
         self.clamp_len = config.clamp_len
 
+        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
         layer = XLNetLayer(config, output_attentions=output_attentions,
                                    keep_multihead_output=keep_multihead_output)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
 
+    def prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.layer[layer].attention.prune_heads(heads)
+
+    def get_multihead_outputs(self):
+        """ Gather all multi-head outputs.
+            Return: list (layers) of multihead module outputs with gradients
+        """
+        return [layer.attention.self.multihead_output for layer in self.layer]
+
     def create_mask(self, qlen, mlen):
         """ create causal attention mask.
             float mask where 1.0 indicate masked, 0.0 indicated not-masked.
@@ -783,12 +889,12 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, word_emb_k, seg_id=None, input_mask=None,
+    def forward(self, inp_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 output_all_encoded_layers=True, head_mask=None):
         """
         Args:
-            word_emb_k: float32 Tensor in shape [len, bsz, d_model], the input token embeddings.
+            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
             seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
             input_mask: [optional] float32 Tensor in shape [len, bsz], the input mask.
                 0 for real tokens and 1 for padding.
@@ -820,11 +926,12 @@ class XLNetModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        qlen, bsz = word_emb_k.shape[0], word_emb_k.shape[1]
+        qlen, bsz = inp_k.shape[0], inp_k.shape[1]
         mlen = mems[0].shape[0] if mems is not None else 0
         klen = mlen + qlen
-        dtype_float = word_emb_k.dtype
-        device = word_emb_k.device
+
+        dtype_float = next(self.parameters()).dtype
+        device = next(self.parameters()).device
 
         ##### Attention mask
         # causal attention mask
@@ -865,7 +972,8 @@ class XLNetModel(XLNetPreTrainedModel):
         else:
             non_tgt_mask = None
 
-        ##### Process Word embeddings and prepare h & g hidden states
+        ##### Word embeddings and prepare h & g hidden states
+        word_emb_k = self.word_embedding(inp_k)
         output_h = self.dropout(word_emb_k)
         if inp_q is not None:
             if target_mapping is not None:
@@ -983,30 +1091,19 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         self.attn_type = config.attn_type
         self.same_length = config.same_length
 
-        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
-        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
                                               keep_multihead_output=keep_multihead_output)
-        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
-        self.dropout = nn.Dropout(config.dropout)
+        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
         # Tie weights
-        self.lm_loss.weight = self.word_embedding.weight
 
         self.apply(self.init_xlnet_weights)
+        self.tie_weights()
 
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+    def tie_weights(self):
+        """ Make sure we are sharing the embeddings
         """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [layer.attention.self.multihead_output for layer in self.encoder.layer]
+        self.lm_loss.weight = self.transformer.word_embedding.weight
 
     def forward(self, inp_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
@@ -1037,9 +1134,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        word_emb_k = self.word_embedding(inp_k)
-
-        output, new_mems = self.transformer(word_emb_k, seg_id, input_mask,
+        output, new_mems = self.transformer(inp_k, seg_id, input_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
@@ -1059,5 +1154,5 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         # if not output_all_encoded_layers:
         #     encoded_layers = encoded_layers[-1]
         # if self.output_attentions:
-        #     return all_attentions, encoded_layers, pooled_output
         return logits, new_mems
+        #     return all_attentions, encoded_layers, pooled_output
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index ec9cc2cda1..31e2baa4e4 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -186,13 +186,13 @@ class XLNetModelTest(unittest.TestCase):
         self.run_tester(XLNetModelTest.XLNetModelTester(self))
 
     def test_config_to_json_string(self):
-        config = XLNetConfig(vocab_size_or_config_json_file=96, d_model=37)
+        config = XLNetConfig(vocab_size_or_config_json_file=96, d_model=16*4)
         obj = json.loads(config.to_json_string())
         self.assertEqual(obj["n_token"], 96)
-        self.assertEqual(obj["d_model"], 37)
+        self.assertEqual(obj["d_model"], 16*4)
 
     def test_config_to_json_file(self):
-        config_first = XLNetConfig(vocab_size_or_config_json_file=96, d_model=37)
+        config_first = XLNetConfig(vocab_size_or_config_json_file=96, d_model=16*4)
         json_file_path = "/tmp/config.json"
         config_first.to_json_file(json_file_path)
         config_second = XLNetConfig.from_json_file(json_file_path)

From 483cbc36a9bac028054335d64a4e68646feab442 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 16:38:01 +0200
Subject: [PATCH 007/139] test deviation with tf model: max ~1e-3 should be ok

---
 hubconfs/xlnet_hubconf.py                 | 169 +++++++++++++++++++
 pytorch_pretrained_bert/__init__.py       |   2 +-
 pytorch_pretrained_bert/modeling_xlnet.py | 193 +++++++++++++++++++---
 tests/modeling_xlnet_test.py              |  12 +-
 tests/tokenization_xlnet_test.py          |  28 ++--
 5 files changed, 358 insertions(+), 46 deletions(-)
 create mode 100644 hubconfs/xlnet_hubconf.py

diff --git a/hubconfs/xlnet_hubconf.py b/hubconfs/xlnet_hubconf.py
new file mode 100644
index 0000000000..155e9ffa42
--- /dev/null
+++ b/hubconfs/xlnet_hubconf.py
@@ -0,0 +1,169 @@
+from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
+from pytorch_pretrained_bert.modeling_xlnet import (
+    XLNetConfig,
+    XLNetModel,
+    XLNetLMHeadModel,
+    XLNetForSequenceClassification
+)
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlnet_docstring = """
+    Params:
+        pretrained_model_name_or_path: either:
+            - a str with the name of a pre-trained model to load selected in the list of:
+                . `xlnet-large-cased`
+            - a path or url to a pretrained model archive containing:
+                . `config.json` a configuration file for the model
+                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
+            - a path or url to a pretrained model archive containing:
+                . `xlnet_config.json` a configuration file for the model
+                . `model.chkpt` a TensorFlow checkpoint
+        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
+        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+        *inputs, **kwargs: additional input for the specific XLNet class
+"""
+
+
+def _append_from_pretrained_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+
+def xlnetTokenizer(*args, **kwargs):
+    """
+    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
+    Peculiarities:
+        - require Google sentencepiece (https://github.com/google/sentencepiece)
+
+    Args:
+    pretrained_model_name_or_path: Path to pretrained model archive
+                                   or one of pre-trained vocab configs below.
+                                       * xlnet-large-cased
+    Keyword args:
+    special_tokens: Special tokens in vocabulary that are not pretrained
+                    Default: None
+    max_len: An artificial maximum length to truncate tokenized sequences to;
+             Effective maximum length is always the minimum of this
+             value (if specified) and the underlying model's
+             sequence length.
+             Default: None
+
+    Example:
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        >>> text = "Who was Jim Henson ?"
+        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+    """
+    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
+    return tokenizer
+
+
+@_append_from_pretrained_docstring(xlnet_docstring)
+def xlnetModel(*args, **kwargs):
+    """
+    xlnetModel is the basic XLNet Transformer model from
+        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetModel', 'xlnet-large-cased')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                hidden_states_1, mems = model(tokens_tensor_1)
+                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
+    """
+    model = XLNetModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_append_from_pretrained_docstring(xlnet_docstring)
+def xlnetLMHeadModel(*args, **kwargs):
+    """
+    xlnetModel is the basic XLNet Transformer model from
+        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+    with a tied (pre-trained) language modeling head on top.
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetLMHeadModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                predictions_1, mems = model(tokens_tensor_1)
+                predictions_2, mems = model(tokens_tensor_2, mems=mems)
+
+        # Get the predicted last token
+        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        >>> predicted_token = tokenizer.decode([predicted_index])
+        >>> assert predicted_token == ' who'
+    """
+    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_append_from_pretrained_docstring(xlnet_docstring)
+def xlnetForSequenceClassification(*args, **kwargs):
+    """
+    xlnetModel is the basic XLNet Transformer model from
+        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        #  Prepare tokenized input
+        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+        >>> tokenized_text1 = tokenizer.tokenize(text1)
+        >>> tokenized_text2 = tokenizer.tokenize(text2)
+        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+        # Load xlnetForSequenceClassification
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+        >>> model.eval()
+
+        # Predict sequence classes logits
+        >>> with torch.no_grad():
+                lm_logits, mems = model(tokens_tensor)
+    """
+    model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+    return model
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 7be5031d0e..89639820b8 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -3,7 +3,7 @@ from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
-from .tokenization_xlnet import XLNetTokenizer
+from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index a165db1768..6b7562e48f 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -165,12 +165,12 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
 
 
 def gelu(x):
-    """Implementation of the gelu activation function.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    """ Implementation of the gelu activation function.
+        XLNet is using OpenAI GPT's gelu (not exactly the same as BERT)
         Also see https://arxiv.org/abs/1606.08415
     """
-    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+    cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return x * cdf
 
 
 def swish(x):
@@ -657,7 +657,7 @@ class XLNetPreTrainedModel(nn.Module):
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `xlnet-large-cased`
                 - a path or url to a pretrained model archive containing:
-                    . `xlnet_config.json` a configuration file for the model
+                    . `config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
                 - a path or url to a pretrained model archive containing:
                     . `xlnet_config.json` a configuration file for the model
@@ -767,6 +767,8 @@ class XLNetPreTrainedModel(nn.Module):
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
+        if isinstance(model, XLNetLMHeadModel):
+            model.tie_weights()  # make sure word embedding weights are still tied
         return model
 
 
@@ -894,23 +896,23 @@ class XLNetModel(XLNetPreTrainedModel):
                 output_all_encoded_layers=True, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
-            seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
-            input_mask: [optional] float32 Tensor in shape [len, bsz], the input mask.
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
-            perm_mask: [optional] float32 Tensor in shape [len, len, bsz].
-                If perm_mask[i, j, k] = 0, i attend to j in batch k;
-                if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
                 If None, each position attends to all the others.
-            target_mapping: [optional] float32 Tensor in shape [num_predict, len, bsz].
-                If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: [optional] float32 Tensor in shape [len, bsz].
+            inp_q: [optional] float32 Tensor in shape [bsz, len].
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
@@ -926,6 +928,16 @@ class XLNetModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
+        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
+        # but we want a unified interface in the library with the batch size on the first dimension
+        # so we move here the first dimension (batch) to the end
+        inp_k = inp_k.transpose(0, 1).contiguous()
+        seg_id = seg_id.transpose(0, 1).contiguous() if seg_id is not None else None
+        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
+        perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
+        target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
+        inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
+
         qlen, bsz = inp_k.shape[0], inp_k.shape[1]
         mlen = mems[0].shape[0] if mems is not None else 0
         klen = mlen + qlen
@@ -1020,6 +1032,7 @@ class XLNetModel(XLNetPreTrainedModel):
         if mems is None:
             mems = [None] * len(self.layer)
 
+        hidden_states = []
         for i, layer_module in enumerate(self.layer):
             # cache new mems
             new_mems.append(self.cache_mem(output_h, mems[i]))
@@ -1029,10 +1042,14 @@ class XLNetModel(XLNetPreTrainedModel):
                                               r=pos_emb, seg_mat=seg_mat,
                                               mems=mems[i], target_mapping=target_mapping,
                                               head_mask=head_mask)
-
+            hidden_states.append(output_h)
         output = self.dropout(output_g if output_g is not None else output_h)
 
-        return output, new_mems
+        # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
+        output = output.permute(1, 0, 2).contiguous()
+        hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+
+        return output, hidden_states, new_mems
 
 
 class XLNetLMHeadModel(XLNetPreTrainedModel):
@@ -1110,23 +1127,23 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                 target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
-            seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
-            input_mask: float32 Tensor in shape [len, bsz], the input mask.
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
-            perm_mask: float32 Tensor in shape [len, len, bsz].
-                If perm_mask[i, j, k] = 0, i attend to j in batch k;
-                if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+            perm_mask: float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
                 If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [num_predict, len, bsz].
-                If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: float32 Tensor in shape [len, bsz].
+            inp_q: float32 Tensor in shape [bsz, len].
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
@@ -1134,7 +1151,131 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, new_mems = self.transformer(inp_k, seg_id, input_mask,
+        output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
+                                            mems, perm_mask, target_mapping, inp_q,
+                                            output_all_encoded_layers, head_mask)
+
+        logits = self.lm_loss(output)
+
+        if target is not None:
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(logits.view(-1, logits.size(-1)),
+                            target.view(-1))
+            return loss, new_mems
+
+        # if self.output_attentions:
+        #     all_attentions, encoded_layers = encoded_layers
+        # sequence_output = encoded_layers[-1]
+        # pooled_output = self.pooler(sequence_output)
+        # if not output_all_encoded_layers:
+        #     encoded_layers = encoded_layers[-1]
+        # if self.output_attentions:
+        return logits, new_mems
+        #     return all_attentions, encoded_layers, pooled_output
+
+
+class XLNetForSequenceClassification(XLNetPreTrainedModel):
+    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+
+    Params:
+        `config`: a XLNetConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+        `summary_type`: str, "last", "first", "mean", or "attn". The method
+            to pool the input to get a vector representation. Default: last
+
+    Inputs:
+        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+        input_mask: float32 Tensor in shape [bsz, len], the input mask.
+            0 for real tokens and 1 for padding.
+        mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            from previous batches. The length of the list equals n_layer.
+            If None, no memory is used.
+        perm_mask: float32 Tensor in shape [bsz, len, len].
+            If perm_mask[k, i, j] = 0, i attend to j in batch k;
+            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+            If None, each position attends to all the others.
+        target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+            on the j-th token.
+            Only used during pretraining for partial prediction.
+            Set to None during finetuning.
+        inp_q: float32 Tensor in shape [bsz, len].
+            1 for tokens with losses and 0 for tokens without losses.
+            Only used during pretraining for two-stream attention.
+            Set to None during finetuning.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+    Outputs: Tuple of (logits or loss, mems)
+        `logits or loss`:
+            if target is None:
+                Token logits with shape [batch_size, sequence_length] 
+            else:
+                CrossEntropy loss with the targets
+        `new_mems`: list (num layers) of updated mem states at the entry of each layer
+            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
+        n_layer=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.XLNetModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, summary_type="last", output_attentions=False, keep_multihead_output=False):
+        super(XLNetForSequenceClassification, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.attn_type = config.attn_type
+        self.same_length = config.same_length
+        self.summary_type = summary_type
+
+        self.transformer = XLNetModel(config, output_attentions=output_attentions,
+                                              keep_multihead_output=keep_multihead_output)
+        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+
+        self.apply(self.init_xlnet_weights)
+        self.tie_weights()
+
+    def forward(self, inp_k, seg_id=None, input_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                target=None, output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+                0 for real tokens and 1 for padding.
+            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: float32 Tensor in shape [bsz, len].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
+        """
+        output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index 31e2baa4e4..65d2c6648d 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -74,9 +74,9 @@ class XLNetModelTest(unittest.TestCase):
             self.type_vocab_size = type_vocab_size
 
         def prepare_config_and_inputs(self):
-            input_ids_1 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
-            input_ids_2 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
-            segment_ids = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.type_vocab_size)
+            input_ids_1 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_2 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            segment_ids = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             # inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
             # seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
@@ -101,7 +101,7 @@ class XLNetModelTest(unittest.TestCase):
 
             lm_labels = None
             if self.use_labels:
-                lm_labels = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+                lm_labels = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = XLNetConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -155,7 +155,7 @@ class XLNetModelTest(unittest.TestCase):
                 [])
             self.parent.assertListEqual(
                 list(result["lm_logits_1"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1a"]),
                 [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
@@ -171,7 +171,7 @@ class XLNetModelTest(unittest.TestCase):
                 [])
             self.parent.assertListEqual(
                 list(result["lm_logits_2"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2a"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
diff --git a/tests/tokenization_xlnet_test.py b/tests/tokenization_xlnet_test.py
index e383dd7877..e1267d3f9f 100644
--- a/tests/tokenization_xlnet_test.py
+++ b/tests/tokenization_xlnet_test.py
@@ -20,7 +20,9 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP)
+from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
+                                                        PRETRAINED_VOCAB_ARCHIVE_MAP,
+                                                        SPIECE_UNDERLINE)
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(
                     os.path.dirname(os.path.abspath(__file__))),
@@ -45,9 +47,9 @@ class XLNetTokenizationTest(unittest.TestCase):
         os.remove(special_tokens_file)
 
         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
-                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
-                                      '▁is', '▁f', 'al', 's', 'é', '.'])
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
+                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
+                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's', 'é', '.'])
         ids = tokenizer.convert_tokens_to_ids(tokens)
         self.assertListEqual(
             ids, [8, 21, 84, 55, 24, 19, 7, 0,
@@ -55,9 +57,9 @@ class XLNetTokenizationTest(unittest.TestCase):
                             46, 72, 80, 6, 0, 4])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in',
-                                           '▁', '<unk>', '2', '0', '0', '0', ',',
-                                           '▁and', '▁this', '▁is', '▁f', 'al', 's',
+        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in',
+                                           SPIECE_UNDERLINE + '', '<unk>', '2', '0', '0', '0', ',',
+                                           SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this', SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's',
                                            '<unk>', '.'])
 
     @pytest.mark.slow
@@ -71,17 +73,17 @@ class XLNetTokenizationTest(unittest.TestCase):
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, ['▁', 'i', '▁was', '▁b', 'or', 'n', '▁in', '▁',
-                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
-                                      '▁is', '▁f', 'al', 'se', '.'])
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + '', 'i', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
+                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
+                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
         self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["▁he", "ll", "o"])
 
     def test_tokenizer_no_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
-                                      '9', '2', '0', '0', '0', ',', '▁and', '▁this',
-                                      '▁is', '▁f', 'al', 'se', '.'])
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
+                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
+                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
 
 
 if __name__ == '__main__':

From ebd2cb8d74f62e0dd3c2ebc3411ee55d7f5a7b8c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 21:08:44 +0200
Subject: [PATCH 008/139] update from_pretrained to load XLNetModel as well

---
 examples/generation_xlnet.py                  | 21 ++++++++
 pytorch_pretrained_bert/modeling_xlnet.py     | 51 ++++++++++++-------
 pytorch_pretrained_bert/tokenization_xlnet.py | 15 ++++++
 tests/modeling_xlnet_test.py                  | 48 ++++++++++-------
 4 files changed, 99 insertions(+), 36 deletions(-)
 create mode 100644 examples/generation_xlnet.py

diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
new file mode 100644
index 0000000000..7d83d1bf20
--- /dev/null
+++ b/examples/generation_xlnet.py
@@ -0,0 +1,21 @@
+import torch
+from torch.nn import functional as F
+from pytorch_pretrained_bert import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+model = XLNetModel.from_pretrained('xlnet-large-cased')
+model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+
+tokens = tokenizer.encode('I am very ')
+for i in range(len(tokens), 20):
+    mask = torch.tensor([[[0.0] * i + [1.0]]])
+    logits, _ = model(torch.tensor([tokens + [0]]),
+                      perm_mask=mask.expand(-1, i+1, -1),
+                      target_mapping=mask,
+                      inp_q=mask.squeeze(1))
+    output = torch.multinomial(F.softmax(logits[0, 0, :]), 1)
+    tokens.append(output.item())
+    print(tokenizer.decode(tokens))
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 6b7562e48f..f825043e8c 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -727,16 +727,24 @@ class XLNetPreTrainedModel(nn.Module):
                 archive_file, resolved_archive_file))
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))
+
         # Load config
         config = XLNetConfig.from_json_file(resolved_config_file)
         logger.info("Model config {}".format(config))
+
+        # Update config with kwargs if needed
+        for key, value in kwargs:
+            if hasattr(config, key):
+                setattr(config, key, value)
+
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
             state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint
-            return load_tf_weights_in_xlnet(model, resolved_archive_file)
+            return load_tf_weights_in_xlnet(model, config, resolved_archive_file)
+
         # Load from a PyTorch state_dict
         missing_keys = []
         unexpected_keys = []
@@ -755,8 +763,8 @@ class XLNetPreTrainedModel(nn.Module):
                 if child is not None:
                     load(child, prefix + name + '.')
         start_prefix = ''
-        if not hasattr(model, 'xlnet') and any(s.startswith('xlnet.') for s in state_dict.keys()):
-            start_prefix = 'xlnet.'
+        if not hasattr(model, 'transformer') and any(s.startswith('transformer') for s in state_dict.keys()):
+            start_prefix = 'transformer.'
         load(model, prefix=start_prefix)
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
@@ -989,10 +997,10 @@ class XLNetModel(XLNetPreTrainedModel):
         output_h = self.dropout(word_emb_k)
         if inp_q is not None:
             if target_mapping is not None:
-                word_emb_q = mask_emb.expand(target_mapping.shape[0], bsz, -1)
+                word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
             else:
                 inp_q_ext = inp_q[:, :, None]
-                word_emb_q = inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
+                word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
             output_g = self.dropout(word_emb_q)
         else:
             output_g = None
@@ -1062,19 +1070,26 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             This can be used to compute head importance metrics. Default: False
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see XLNet paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+            0 for real tokens and 1 for padding.
+        mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            from previous batches. The length of the list equals n_layer.
+            If None, no memory is used.
+        perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
+            If perm_mask[k, i, j] = 0, i attend to j in batch k;
+            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+            If None, each position attends to all the others.
+        target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
+            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+            on the j-th token.
+            Only used during pretraining for partial prediction.
+            Set to None during finetuning.
+        inp_q: [optional] float32 Tensor in shape [bsz, len].
+            1 for tokens with losses and 0 for tokens without losses.
+            Only used during pretraining for two-stream attention.
+            Set to None during finetuning.
 
 
     Outputs: Tuple of (encoded_layers, pooled_output)
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index c9a3d40631..3cc5053338 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -37,6 +37,11 @@ VOCAB_NAME = 'spiece.model'
 SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 SPIECE_UNDERLINE = '▁'
+SEG_ID_A   = 0
+SEG_ID_B   = 1
+SEG_ID_CLS = 2
+SEG_ID_SEP = 3
+SEG_ID_PAD = 4
 
 class XLNetTokenizer(object):
     """
@@ -52,6 +57,16 @@ class XLNetTokenizer(object):
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
             special_tokens_file = None
+            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
+                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
+                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
+                               "you may want to check this behavior.")
+                kwargs['do_lower_case'] = False
+            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
+                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
+                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
+                               "but you may want to check this behavior.")
+                kwargs['do_lower_case'] = True
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
             special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index 65d2c6648d..c99cfe25dd 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -78,23 +78,30 @@ class XLNetModelTest(unittest.TestCase):
             input_ids_2 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
             segment_ids = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
-            # inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
-            # seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
-            # input_mask: float32 Tensor in shape [len, bsz], the input mask.
+            input_ids_q = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
+            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
+            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
+            target_mapping[:, 0, -1] = 1.0  # predict last token
+            inp_q = target_mapping[:, 0, :].clone()  # predict last token
+
+            # inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            # seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+            # input_mask: float32 Tensor in shape [bsz, len], the input mask.
             #     0 for real tokens and 1 for padding.
-            # mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            # mems: a list of float32 Tensors in shape [bsz, mem_len, d_model], memory
             #     from previous batches. The length of the list equals n_layer.
             #     If None, no memory is used.
-            # perm_mask: float32 Tensor in shape [len, len, bsz].
-            #     If perm_mask[i, j, k] = 0, i attend to j in batch k;
-            #     if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
+            # perm_mask: float32 Tensor in shape [bsz, len, len].
+            #     If perm_mask[k, i, j] = 0, i attend to j in batch k;
+            #     if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
             #     If None, each position attends to all the others.
-            # target_mapping: float32 Tensor in shape [num_predict, len, bsz].
-            #     If target_mapping[i, j, k] = 1, the i-th predict in batch k is
+            # target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+            #     If target_mapping[k, i, j] = 1, the i-th predict in batch k is
             #     on the j-th token.
             #     Only used during pretraining for partial prediction.
             #     Set to None during finetuning.
-            # inp_q: float32 Tensor in shape [len, bsz].
+            # inp_q: float32 Tensor in shape [bsz, len].
             #     1 for tokens with losses and 0 for tokens without losses.
             #     Only used during pretraining for two-stream attention.
             #     Set to None during finetuning.
@@ -121,30 +128,35 @@ class XLNetModelTest(unittest.TestCase):
 
             config.update(run_config)
 
-            return (config, input_ids_1, input_ids_2, segment_ids, lm_labels)
+            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels)
 
         def set_seed(self):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
-        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, segment_ids, lm_labels):
+        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
             model = XLNetLMHeadModel(config)
             model.eval()
 
             loss_1, mems_1a = model(input_ids_1, seg_id=segment_ids, target=lm_labels)
-            lm_logits_1, mems_1b = model(input_ids_1, seg_id=segment_ids)
+            all_logits_1, mems_1b = model(input_ids_1, seg_id=segment_ids)
 
             loss_2, mems_2a = model(input_ids_2, seg_id=segment_ids, target=lm_labels, mems=mems_1a)
-            lm_logits_2, mems_2b = model(input_ids_2, seg_id=segment_ids, mems=mems_1b)
+            all_logits_2, mems_2b = model(input_ids_2, seg_id=segment_ids, mems=mems_1b)
+
+            logits, _ = model(input_ids_q,
+                                    perm_mask=perm_mask,
+                                    target_mapping=target_mapping,
+                                    inp_q=inp_q)
 
             outputs = {
                 "loss_1": loss_1,
                 "mems_1a": mems_1a,
-                "lm_logits_1": lm_logits_1,
+                "all_logits_1": all_logits_1,
                 "mems_1b": mems_1b,
                 "loss_2": loss_2,
                 "mems_2a": mems_2a,
-                "lm_logits_2": lm_logits_2,
+                "all_logits_2": all_logits_2,
                 "mems_2b": mems_2b,
             }
             return outputs
@@ -154,7 +166,7 @@ class XLNetModelTest(unittest.TestCase):
                 list(result["loss_1"].size()),
                 [])
             self.parent.assertListEqual(
-                list(result["lm_logits_1"].size()),
+                list(result["all_logits_1"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1a"]),
@@ -170,7 +182,7 @@ class XLNetModelTest(unittest.TestCase):
                 list(result["loss_2"].size()),
                 [])
             self.parent.assertListEqual(
-                list(result["lm_logits_2"].size()),
+                list(result["all_logits_2"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2a"]),

From 181075635d6f8d0596bf2e205fb611389c760ea4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 21 Jun 2019 23:23:37 +0200
Subject: [PATCH 009/139] updating model loading and adding special tokens ids

---
 examples/generation_xlnet.py                  |  7 +++---
 pytorch_pretrained_bert/modeling_xlnet.py     |  9 +++++--
 pytorch_pretrained_bert/tokenization_xlnet.py | 24 ++++++++++++++++++-
 xlnet                                         |  1 +
 4 files changed, 34 insertions(+), 7 deletions(-)
 create mode 160000 xlnet

diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
index 7d83d1bf20..e54f6a365f 100644
--- a/examples/generation_xlnet.py
+++ b/examples/generation_xlnet.py
@@ -6,14 +6,13 @@ import logging
 logging.basicConfig(level=logging.INFO)
 
 tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-model = XLNetModel.from_pretrained('xlnet-large-cased')
-model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', attn_type='uni')
 
-tokens = tokenizer.encode('I am very ')
+tokens = tokenizer.encode('I am very happy')
 for i in range(len(tokens), 20):
     mask = torch.tensor([[[0.0] * i + [1.0]]])
     logits, _ = model(torch.tensor([tokens + [0]]),
-                      perm_mask=mask.expand(-1, i+1, -1),
+                    #   perm_mask=mask.expand(-1, i+1, -1),
                       target_mapping=mask,
                       inp_q=mask.squeeze(1))
     output = torch.multinomial(F.softmax(logits[0, 0, :]), 1)
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index f825043e8c..a5af36ce29 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -730,12 +730,17 @@ class XLNetPreTrainedModel(nn.Module):
 
         # Load config
         config = XLNetConfig.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
 
         # Update config with kwargs if needed
-        for key, value in kwargs:
+        to_remove = []
+        for key, value in kwargs.items():
             if hasattr(config, key):
                 setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config {}".format(config))
 
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index 3cc5053338..2fad20fb02 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -36,7 +36,29 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
 VOCAB_NAME = 'spiece.model'
 SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
-SPIECE_UNDERLINE = '▁'
+SPIECE_UNDERLINE = u'▁'
+
+# Tokens
+special_symbols = {
+    "<unk>"  : 0,
+    "<s>"    : 1,
+    "</s>"   : 2,
+    "<cls>"  : 3,
+    "<sep>"  : 4,
+    "<pad>"  : 5,
+    "<mask>" : 6,
+    "<eod>"  : 7,
+    "<eop>"  : 8,
+}
+
+VOCAB_SIZE = 32000
+UNK_ID = special_symbols["<unk>"]
+CLS_ID = special_symbols["<cls>"]
+SEP_ID = special_symbols["<sep>"]
+MASK_ID = special_symbols["<mask>"]
+EOD_ID = special_symbols["<eod>"]
+
+# Segments (not really needed)
 SEG_ID_A   = 0
 SEG_ID_B   = 1
 SEG_ID_CLS = 2
diff --git a/xlnet b/xlnet
new file mode 160000
index 0000000000..cbdedecbc7
--- /dev/null
+++ b/xlnet
@@ -0,0 +1 @@
+Subproject commit cbdedecbc7951fc000a1547f9feb086c34f0698b

From c946bb51a61f67b0c9eaae1c9cf6f164a7748e37 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 22 Jun 2019 22:28:49 +0200
Subject: [PATCH 010/139] fix xlnet tokenizer and python2

---
 pytorch_pretrained_bert/tokenization_xlnet.py | 10 ++++-
 tests/tokenization_xlnet_test.py              | 41 +++++++++++--------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index 2fad20fb02..320800c8ff 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -241,7 +241,7 @@ class XLNetTokenizer(object):
             )
         return ids
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+    def convert_ids_to_tokens(self, ids, return_unicode=True, skip_special_tokens=False):
         """Converts a sequence of ids in tokens."""
         tokens = []
         for i in ids:
@@ -250,6 +250,14 @@ class XLNetTokenizer(object):
                     tokens.append(self.special_tokens_decoder[i])
             else:
                 tokens.append(self.sp_model.IdToPiece(i))
+
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in tokens:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            tokens = ret_pieces
         return tokens
 
     def encode(self, text, sample=False):
diff --git a/tests/tokenization_xlnet_test.py b/tests/tokenization_xlnet_test.py
index e1267d3f9f..30d7f37c04 100644
--- a/tests/tokenization_xlnet_test.py
+++ b/tests/tokenization_xlnet_test.py
@@ -33,23 +33,24 @@ class XLNetTokenizationTest(unittest.TestCase):
     def test_full_tokenizer(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB)
 
-        tokens = tokenizer.tokenize('This is a test')
-        self.assertListEqual(tokens, ['▁This', '▁is', '▁a', '▁t', 'est'])
+        tokens = tokenizer.tokenize(u'This is a test')
+        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
 
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
 
-        vocab_path = "/tmp/"
+        vocab_path = u"/tmp/"
         vocab_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path)
         tokenizer = tokenizer.from_pretrained(vocab_path,
                                               keep_accents=True)
         os.remove(vocab_file)
         os.remove(special_tokens_file)
 
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
-                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
-                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's', 'é', '.'])
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
         ids = tokenizer.convert_tokens_to_ids(tokens)
         self.assertListEqual(
             ids, [8, 21, 84, 55, 24, 19, 7, 0,
@@ -57,10 +58,12 @@ class XLNetTokenizationTest(unittest.TestCase):
                             46, 72, 80, 6, 0, 4])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in',
-                                           SPIECE_UNDERLINE + '', '<unk>', '2', '0', '0', '0', ',',
-                                           SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this', SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's',
-                                           '<unk>', '.'])
+        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                           u'<unk>', u'.'])
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
@@ -73,17 +76,19 @@ class XLNetTokenizationTest(unittest.TestCase):
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + '', 'i', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
-                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
-                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
-        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["▁he", "ll", "o"])
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
+        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"])
 
     def test_tokenizer_no_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
-                                      '9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
-                                      SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or',
+                                      u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
 
 
 if __name__ == '__main__':

From f6081f2255d71779f4ef71acd47502dbe00bf2a9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 24 Jun 2019 10:01:07 +0200
Subject: [PATCH 011/139] add xlnetforsequence classif and run_classifier
 example for xlnet

---
 examples/bertology.py                         |   2 +-
 ...n_classifier.py => run_bert_classifier.py} |   2 +-
 ...atures.py => run_bert_extract_features.py} |   0
 examples/{run_squad.py => run_bert_squad.py}  |   2 +-
 examples/run_xlnet_classifier.py              | 539 ++++++++++++++++++
 ...ssifier_dataset_utils.py => utils_glue.py} |   0
 ..._squad_dataset_utils.py => utils_squad.py} |   1 +
 hubconfs/xlnet_hubconf.py                     |  58 +-
 pytorch_pretrained_bert/modeling_xlnet.py     |  59 +-
 9 files changed, 621 insertions(+), 42 deletions(-)
 rename examples/{run_classifier.py => run_bert_classifier.py} (99%)
 rename examples/{extract_features.py => run_bert_extract_features.py} (100%)
 rename examples/{run_squad.py => run_bert_squad.py} (99%)
 create mode 100644 examples/run_xlnet_classifier.py
 rename examples/{run_classifier_dataset_utils.py => utils_glue.py} (100%)
 rename examples/{run_squad_dataset_utils.py => utils_squad.py} (99%)

diff --git a/examples/bertology.py b/examples/bertology.py
index 4bb23b8f16..6f7f7c9592 100644
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -14,7 +14,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
 
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/run_classifier.py b/examples/run_bert_classifier.py
similarity index 99%
rename from examples/run_classifier.py
rename to examples/run_bert_classifier.py
index 5a359ad262..cc8d1fe571 100644
--- a/examples/run_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -39,7 +39,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
diff --git a/examples/extract_features.py b/examples/run_bert_extract_features.py
similarity index 100%
rename from examples/extract_features.py
rename to examples/run_bert_extract_features.py
diff --git a/examples/run_squad.py b/examples/run_bert_squad.py
similarity index 99%
rename from examples/run_squad.py
rename to examples/run_bert_squad.py
index bf1763e884..c0e7844236 100644
--- a/examples/run_squad.py
+++ b/examples/run_bert_squad.py
@@ -38,7 +38,7 @@ from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 
-from run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
new file mode 100644
index 0000000000..bedca65bb7
--- /dev/null
+++ b/examples/run_xlnet_classifier.py
@@ -0,0 +1,539 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import sys
+import random
+from tqdm import tqdm, trange
+
+import numpy as np
+
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from tensorboardX import SummaryWriter
+
+from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
+from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
+                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
+    parser.add_argument("--task_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--cache_dir",
+                        default="",
+                        type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_train",
+                        action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=8,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir',
+                        action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--loss_scale',
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+    args.device = device
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    task_name = args.task_name.lower()
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
+    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    global_step = 0
+    nb_tr_steps = 0
+    tr_loss = 0
+
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+
+        # Prepare data loader
+        train_examples = processor.get_train_examples(args.data_dir)
+        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
+            list(filter(None, args.xlnet_model.split('/'))).pop(),
+                        str(args.max_seq_length),
+                        str(task_name)))
+        try:
+            with open(cached_train_features_file, "rb") as reader:
+                train_features = pickle.load(reader)
+        except:
+            train_features = convert_examples_to_features(
+                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                with open(cached_train_features_file, "wb") as writer:
+                    pickle.dump(train_features, writer)
+
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+        # Prepare optimizer
+
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+            tr_loss = 0
+            nb_tr_examples, nb_tr_steps = 0, 0
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                # define a new function to compute loss values for both output_modes
+                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+
+                if output_mode == "classification":
+                    loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+                elif output_mode == "regression":
+                    loss_fct = MSELoss()
+                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used that handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    ### Example:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
+        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
+
+    model.to(device)
+
+    ### Evaluation
+    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_examples = processor.get_dev_examples(args.data_dir)
+        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
+            list(filter(None, args.xlnet_model.split('/'))).pop(),
+                        str(args.max_seq_length),
+                        str(task_name)))
+        try:
+            with open(cached_eval_features_file, "rb") as reader:
+                eval_features = pickle.load(reader)
+        except:
+            eval_features = convert_examples_to_features(
+                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
+                with open(cached_eval_features_file, "wb") as writer:
+                    pickle.dump(eval_features, writer)
+
+
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        # Run prediction for full data
+        if args.local_rank == -1:
+            eval_sampler = SequentialSampler(eval_data)
+        else:
+            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        model.eval()
+        eval_loss = 0
+        nb_eval_steps = 0
+        preds = []
+        out_label_ids = None
+
+        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            with torch.no_grad():
+                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+
+            # create eval loss and other metric required by the task
+            if output_mode == "classification":
+                loss_fct = CrossEntropyLoss()
+                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+            elif output_mode == "regression":
+                loss_fct = MSELoss()
+                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+
+            eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if len(preds) == 0:
+                preds.append(logits.detach().cpu().numpy())
+                out_label_ids = label_ids.detach().cpu().numpy()
+            else:
+                preds[0] = np.append(
+                    preds[0], logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(
+                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        preds = preds[0]
+        if output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif output_mode == "regression":
+            preds = np.squeeze(preds)
+        result = compute_metrics(task_name, preds, out_label_ids)
+
+        loss = tr_loss/global_step if args.do_train else None
+
+        result['eval_loss'] = eval_loss
+        result['global_step'] = global_step
+        result['loss'] = loss
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+        # hack for MNLI-MM
+        if task_name == "mnli":
+            task_name = "mnli-mm"
+            processor = processors[task_name]()
+
+            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
+                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+            if not os.path.exists(args.output_dir + '-MM'):
+                os.makedirs(args.output_dir + '-MM')
+
+            eval_examples = processor.get_dev_examples(args.data_dir)
+            eval_features = convert_examples_to_features(
+                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+            logger.info("***** Running evaluation *****")
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+
+            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+            # Run prediction for full data
+            eval_sampler = SequentialSampler(eval_data)
+            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+            model.eval()
+            eval_loss = 0
+            nb_eval_steps = 0
+            preds = []
+            out_label_ids = None
+
+            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
+                input_ids = input_ids.to(device)
+                input_mask = input_mask.to(device)
+                segment_ids = segment_ids.to(device)
+                label_ids = label_ids.to(device)
+
+                with torch.no_grad():
+                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
+
+                loss_fct = CrossEntropyLoss()
+                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+
+                eval_loss += tmp_eval_loss.mean().item()
+                nb_eval_steps += 1
+                if len(preds) == 0:
+                    preds.append(logits.detach().cpu().numpy())
+                    out_label_ids = label_ids.detach().cpu().numpy()
+                else:
+                    preds[0] = np.append(
+                        preds[0], logits.detach().cpu().numpy(), axis=0)
+                    out_label_ids = np.append(
+                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+            eval_loss = eval_loss / nb_eval_steps
+            preds = preds[0]
+            preds = np.argmax(preds, axis=1)
+            result = compute_metrics(task_name, preds, out_label_ids)
+
+            loss = tr_loss/global_step if args.do_train else None
+
+            result['eval_loss'] = eval_loss
+            result['global_step'] = global_step
+            result['loss'] = loss
+
+            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/run_classifier_dataset_utils.py b/examples/utils_glue.py
similarity index 100%
rename from examples/run_classifier_dataset_utils.py
rename to examples/utils_glue.py
diff --git a/examples/run_squad_dataset_utils.py b/examples/utils_squad.py
similarity index 99%
rename from examples/run_squad_dataset_utils.py
rename to examples/utils_squad.py
index 4043ee57f8..e4e43eff9d 100644
--- a/examples/run_squad_dataset_utils.py
+++ b/examples/utils_squad.py
@@ -1,3 +1,4 @@
+
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
diff --git a/hubconfs/xlnet_hubconf.py b/hubconfs/xlnet_hubconf.py
index 155e9ffa42..d3766d04e0 100644
--- a/hubconfs/xlnet_hubconf.py
+++ b/hubconfs/xlnet_hubconf.py
@@ -3,7 +3,7 @@ from pytorch_pretrained_bert.modeling_xlnet import (
     XLNetConfig,
     XLNetModel,
     XLNetLMHeadModel,
-    XLNetForSequenceClassification
+    # XLNetForSequenceClassification
 )
 
 # A lot of models share the same param doc. Use a decorator
@@ -135,35 +135,35 @@ def xlnetLMHeadModel(*args, **kwargs):
     return model
 
 
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetForSequenceClassification(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+# @_append_from_pretrained_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
 
-    Example:
-        # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
 
-        #  Prepare tokenized input
-        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        >>> tokenized_text1 = tokenizer.tokenize(text1)
-        >>> tokenized_text2 = tokenizer.tokenize(text2)
-        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+#         #  Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
-        # Load xlnetForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-        >>> model.eval()
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model.eval()
 
-        # Predict sequence classes logits
-        >>> with torch.no_grad():
-                lm_logits, mems = model(tokens_tensor)
-    """
-    model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-    return model
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index a5af36ce29..45cd6350d5 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -1194,6 +1194,38 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         return logits, new_mems
         #     return all_attentions, encoded_layers, pooled_output
 
+class XLNetSequenceSummary(nn.Module):
+    def __init__(self, config, summary_type="last", use_proj=True,
+                 output_attentions=False, keep_multihead_output=False):
+        super(XLNetSequenceSummary, self).__init__()
+        self.summary_type = summary_type
+        if use_proj:
+            self.summary = nn.Linear(config.hidden_size, num_labels)
+        else:
+            self.summary = None
+        if summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+        self.dropout = nn.Dropout(config.dropout)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states, input_mask=None):
+        if self.summary_type == 'last':
+            output = hidden_states[-1]
+        elif self.summary_type == 'first':
+            output = hidden_states[0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=0)
+        elif summary_type == 'attn':
+            raise NotImplementedError
+
+        output = self.summary(output)
+        output = self.dropout(output)
+        output = self.activation(output)
+        return output
+
 
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
@@ -1255,19 +1287,23 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, summary_type="last", output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
+                 is_regression=False, output_attentions=False, keep_multihead_output=False):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.attn_type = config.attn_type
         self.same_length = config.same_length
         self.summary_type = summary_type
+        self.is_regression = is_regression
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
                                               keep_multihead_output=keep_multihead_output)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
-        self.apply(self.init_xlnet_weights)
-        self.tie_weights()
+        self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
+                                                     use_proj=use_proj, output_attentions=output_attentions,
+                                                     keep_multihead_output=keep_multihead_output)
+        self.loss_proj = nn.Linear(config.d_model, num_classes if not is_regression else 1)
+        self.apply(self.init_bert_weights)
 
     def forward(self, inp_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
@@ -1295,17 +1331,20 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
+        output, _, new_mems = self.transformer(inp_k, seg_id, input_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
-        logits = self.lm_loss(output)
+        output = self.sequence_summary(output)
+        logits = self.loss_proj(output)
 
         if target is not None:
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            target.view(-1))
+            if self.is_regression:
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), target.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss(ignore_index=-1)
+                loss = loss_fct(logits.view(-1, logits.size(-1)), target.view(-1))
             return loss, new_mems
 
         # if self.output_attentions:

From 24ed0b9346079da741b952c21966fdc2063292e4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 24 Jun 2019 12:00:09 +0200
Subject: [PATCH 012/139] updating run_xlnet_classifier

---
 .gitignore                                |   5 +-
 examples/run_xlnet_classifier.py          | 117 +++----
 examples/run_xlnet_squad.py               | 398 ++++++++++++++++++++++
 pytorch_pretrained_bert/modeling.py       |  18 +-
 pytorch_pretrained_bert/modeling_xlnet.py | 174 +++++++---
 5 files changed, 587 insertions(+), 125 deletions(-)
 create mode 100644 examples/run_xlnet_squad.py

diff --git a/.gitignore b/.gitignore
index 8abc9b84e1..05129fc402 100644
--- a/.gitignore
+++ b/.gitignore
@@ -123,4 +123,7 @@ tensorflow_code
 
 # Models
 models
-proc_data
\ No newline at end of file
+proc_data
+
+# examples
+examples/runs
\ No newline at end of file
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index bedca65bb7..154aac332a 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -54,91 +54,58 @@ def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
-                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
-    parser.add_argument("--task_name",
-                        default=None,
-                        type=str,
-                        required=True,
+    parser.add_argument("--task_name", default=None, type=str, required=True,
                         help="The name of the task to train.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--cache_dir",
-                        default="",
-                        type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
+    # training
+    parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training to perform linear learning rate warmup for. "
                              "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
+    parser.add_argument("--train_batch_size", default=32, type=int,
+                        help="Total batch size for training.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
+    parser.add_argument('--fp16', action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
+    parser.add_argument('--loss_scale', type=float, default=0,
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                              "0 (default value): dynamic loss scaling.\n"
                              "Positive power of 2: static loss scaling value.\n")
+    # evaluation
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--eval_batch_size", default=8, type=int,
+                        help="Total batch size for eval.")
+    # Model
+    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
+                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    # task specific
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    # Misc
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
@@ -306,7 +273,7 @@ def main():
                 input_ids, input_mask, segment_ids, label_ids = batch
 
                 # define a new function to compute loss values for both output_modes
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
 
                 if output_mode == "classification":
                     loss_fct = CrossEntropyLoss()
@@ -420,7 +387,7 @@ def main():
             label_ids = label_ids.to(device)
 
             with torch.no_grad():
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
 
             # create eval loss and other metric required by the task
             if output_mode == "classification":
@@ -501,7 +468,7 @@ def main():
                 label_ids = label_ids.to(device)
 
                 with torch.no_grad():
-                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
+                    logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
 
                 loss_fct = CrossEntropyLoss()
                 tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
new file mode 100644
index 0000000000..b01bf82a55
--- /dev/null
+++ b/examples/run_xlnet_squad.py
@@ -0,0 +1,398 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Run BERT on SQuAD."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import sys
+from io import open
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering
+from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+
+from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    ## Other parameters
+    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
+    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
+    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
+    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
+                             "of training.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
+                             "output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--overwrite_output_dir',
+                        action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--loss_scale',
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument('--version_2_with_negative',
+                        action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold',
+                        type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    print(args)
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_predict:
+        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
+
+    if args.do_train:
+        if not args.train_file:
+            raise ValueError(
+                "If `do_train` is True, then `train_file` must be specified.")
+    if args.do_predict:
+        if not args.predict_file:
+            raise ValueError(
+                "If `do_predict` is True, then `predict_file` must be specified.")
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory () already exists and is not empty.")
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+        # Prepare data loader
+        train_examples = read_squad_examples(
+            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
+        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
+            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
+        try:
+            with open(cached_train_features_file, "rb") as reader:
+                train_features = pickle.load(reader)
+        except:
+            train_features = convert_examples_to_features(
+                examples=train_examples,
+                tokenizer=tokenizer,
+                max_seq_length=args.max_seq_length,
+                doc_stride=args.doc_stride,
+                max_query_length=args.max_query_length,
+                is_training=True)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                with open(cached_train_features_file, "wb") as writer:
+                    pickle.dump(train_features, writer)
+
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                   all_start_positions, all_end_positions)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        # if args.local_rank != -1:
+        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        global_step = 0
+
+        logger.info("***** Running training *****")
+        logger.info("  Num orig examples = %d", len(train_examples))
+        logger.info("  Num split examples = %d", len(train_features))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+
+        model.train()
+        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                if n_gpu == 1:
+                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
+                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
+                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used and handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+
+    model.to(device)
+
+    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_examples = read_squad_examples(
+            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
+        eval_features = convert_examples_to_features(
+            examples=eval_examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=False)
+
+        logger.info("***** Running predictions *****")
+        logger.info("  Num orig examples = %d", len(eval_examples))
+        logger.info("  Num split examples = %d", len(eval_features))
+        logger.info("  Batch size = %d", args.predict_batch_size)
+
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
+
+        model.eval()
+        all_results = []
+        logger.info("Start evaluating")
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
+            if len(all_results) % 1000 == 0:
+                logger.info("Processing example: %d" % (len(all_results)))
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            with torch.no_grad():
+                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
+            for i, example_index in enumerate(example_indices):
+                start_logits = batch_start_logits[i].detach().cpu().tolist()
+                end_logits = batch_end_logits[i].detach().cpu().tolist()
+                eval_feature = eval_features[example_index.item()]
+                unique_id = int(eval_feature.unique_id)
+                all_results.append(RawResult(unique_id=unique_id,
+                                             start_logits=start_logits,
+                                             end_logits=end_logits))
+        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
+        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
+        write_predictions(eval_examples, eval_features, all_results,
+                          args.n_best_size, args.max_answer_length,
+                          args.do_lower_case, output_prediction_file,
+                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                          args.version_2_with_negative, args.null_score_diff_threshold)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2b67a260f0..95bdd7452f 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -606,7 +606,7 @@ class BertPreTrainedModel(nn.Module):
                 ))
         self.config = config
 
-    def init_bert_weights(self, module):
+    def init_weights(self, module):
         """ Initialize the weights.
         """
         if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -823,7 +823,7 @@ class BertModel(BertPreTrainedModel):
         self.encoder = BertEncoder(config, output_attentions=output_attentions,
                                            keep_multihead_output=keep_multihead_output)
         self.pooler = BertPooler(config)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -951,7 +951,7 @@ class BertForPreTraining(BertPreTrainedModel):
         self.bert = BertModel(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)
         self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask,
@@ -1030,7 +1030,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         self.bert = BertModel(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)
         self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask,
@@ -1105,7 +1105,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.bert = BertModel(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)
         self.cls = BertOnlyNSPHead(config)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask,
@@ -1184,7 +1184,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
                                       keep_multihead_output=keep_multihead_output)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
@@ -1261,7 +1261,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
                                       keep_multihead_output=keep_multihead_output)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -1343,7 +1343,7 @@ class BertForTokenClassification(BertPreTrainedModel):
                                       keep_multihead_output=keep_multihead_output)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
@@ -1428,7 +1428,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.bert = BertModel(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
-        self.apply(self.init_bert_weights)
+        self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 45cd6350d5..9cdf82bbc3 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -633,7 +633,7 @@ class XLNetPreTrainedModel(nn.Module):
                 ))
         self.config = config
 
-    def init_xlnet_weights(self, module):
+    def init_weights(self, module):
         """ Initialize the weights.
         """
         if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -904,14 +904,14 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, inp_k, seg_id=None, input_mask=None,
+    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
@@ -945,8 +945,8 @@ class XLNetModel(XLNetPreTrainedModel):
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
         inp_k = inp_k.transpose(0, 1).contiguous()
-        seg_id = seg_id.transpose(0, 1).contiguous() if seg_id is not None else None
-        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
+        token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
+        attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
         inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
@@ -969,11 +969,11 @@ class XLNetModel(XLNetPreTrainedModel):
             raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
 
         # data mask: input mask & perm mask
-        if input_mask is not None and perm_mask is not None:
-            data_mask = input_mask[None] + perm_mask
-        elif input_mask is not None and perm_mask is None:
-            data_mask = input_mask[None]
-        elif input_mask is None and perm_mask is not None:
+        if attention_mask is not None and perm_mask is not None:
+            data_mask = attention_mask[None] + perm_mask
+        elif attention_mask is not None and perm_mask is None:
+            data_mask = attention_mask[None]
+        elif attention_mask is None and perm_mask is not None:
             data_mask = perm_mask
         else:
             data_mask = None
@@ -1011,13 +1011,13 @@ class XLNetModel(XLNetPreTrainedModel):
             output_g = None
 
         ##### Segment embedding
-        if seg_id is not None:
-            # Convert `seg_id` to one-hot `seg_mat`
+        if token_type_ids is not None:
+            # Convert `token_type_ids` to one-hot `seg_mat`
             mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
-            cat_ids = torch.cat([mem_pad, seg_id], dim=0)
+            cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
 
             # `1` indicates not in the same segment [qlen x klen x bsz]
-            seg_mat = (seg_id[:, None] != cat_ids[None, :]).long()
+            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
             seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
         else:
             seg_mat = None
@@ -1076,8 +1076,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     Inputs:
         inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-        seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+        attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
         mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
@@ -1112,14 +1112,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
         n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
     ```
     """
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
@@ -1134,7 +1134,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         # Tie weights
 
-        self.apply(self.init_xlnet_weights)
+        self.apply(self.init_weights)
         self.tie_weights()
 
     def tie_weights(self):
@@ -1142,14 +1142,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self.lm_loss.weight = self.transformer.word_embedding.weight
 
-    def forward(self, inp_k, seg_id=None, input_mask=None,
+    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            attention_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
@@ -1171,7 +1171,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
+        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
@@ -1200,7 +1200,7 @@ class XLNetSequenceSummary(nn.Module):
         super(XLNetSequenceSummary, self).__init__()
         self.summary_type = summary_type
         if use_proj:
-            self.summary = nn.Linear(config.hidden_size, num_labels)
+            self.summary = nn.Linear(config.d_model, config.d_model)
         else:
             self.summary = None
         if summary_type == 'attn':
@@ -1211,19 +1211,20 @@ class XLNetSequenceSummary(nn.Module):
         self.dropout = nn.Dropout(config.dropout)
         self.activation = nn.Tanh()
 
-    def forward(self, hidden_states, input_mask=None):
+    def forward(self, hidden_states):
+        """ hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer."""
         if self.summary_type == 'last':
-            output = hidden_states[-1]
+            output = hidden_states[:, -1]
         elif self.summary_type == 'first':
-            output = hidden_states[0]
+            output = hidden_states[:, 0]
         elif self.summary_type == 'mean':
-            output = hidden_states.mean(dim=0)
+            output = hidden_states.mean(dim=1)
         elif summary_type == 'attn':
             raise NotImplementedError
 
         output = self.summary(output)
-        output = self.dropout(output)
         output = self.activation(output)
+        output = self.dropout(output)
         return output
 
 
@@ -1240,8 +1241,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Inputs:
         inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-        seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: float32 Tensor in shape [bsz, len], the input mask.
+        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+        attention_mask: float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
         mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
@@ -1277,14 +1278,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
         n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
@@ -1302,17 +1303,17 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
                                                      use_proj=use_proj, output_attentions=output_attentions,
                                                      keep_multihead_output=keep_multihead_output)
-        self.loss_proj = nn.Linear(config.d_model, num_classes if not is_regression else 1)
-        self.apply(self.init_bert_weights)
+        self.loss_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
+        self.apply(self.init_weights)
 
-    def forward(self, inp_k, seg_id=None, input_mask=None,
+    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            attention_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
@@ -1331,7 +1332,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, _, new_mems = self.transformer(inp_k, seg_id, input_mask,
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
@@ -1356,3 +1357,96 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         # if self.output_attentions:
         return logits, new_mems
         #     return all_attentions, encoded_layers, pooled_output
+
+class XLNetForQuestionAnswering(XLNetPreTrainedModel):
+    """XLNet model for Question Answering (span extraction).
+    This module is composed of the XLNet model with a linear layer on top of
+    the sequence output that computes start_logits and end_logits
+
+    Params:
+        `config`: a XLNetConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see XLNet paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+            into account for computing the loss.
+        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+            into account for computing the loss.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+    Outputs:
+        if `start_positions` and `end_positions` are not `None`:
+            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
+        if `start_positions` or `end_positions` is `None`:
+            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
+            position tokens of shape [batch_size, sequence_length].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = XLNetForQuestionAnswering(config)
+    start_logits, end_logits = model(input_ids, token_type_ids, attention_mask)
+    ```
+    """
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLNetForQuestionAnswering, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.transformer = XLNetModel(config, output_attentions=output_attentions,
+                                      keep_multihead_output=keep_multihead_output)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.apply(self.init_weights)
+
+    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                start_positions=None, end_positions=None,
+                output_all_encoded_layers=True, head_mask=None):
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
+                                            mems, perm_mask, target_mapping, inp_q,
+                                            output_all_encoded_layers, head_mask)
+
+        logits = self.qa_outputs(output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            return total_loss
+        elif self.output_attentions:
+            return all_attentions, start_logits, end_logits
+        return start_logits, end_logits

From 62d78aa37e7738efaf71e75abd13e4dbe510f1b5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 24 Jun 2019 14:36:11 +0200
Subject: [PATCH 013/139] updating GLUE utils for compatibility with XLNet

---
 README.md                                     | 126 ++++++++++++++----
 examples/run_xlnet_classifier.py              |   8 +-
 examples/utils_glue.py                        |  26 +++-
 .../Comparing-TF-and-PT-models-SQuAD.ipynb    |   2 +-
 pytorch_pretrained_bert/modeling.py           |  16 +--
 pytorch_pretrained_bert/modeling_xlnet.py     | 107 ++++++++++-----
 pytorch_pretrained_bert/tokenization.py       |  41 +++++-
 pytorch_pretrained_bert/tokenization_xlnet.py |  72 +++++++---
 tests/modeling_xlnet_test.py                  |  10 +-
 9 files changed, 310 insertions(+), 98 deletions(-)

diff --git a/README.md b/README.md
index 2c67dc65e8..4f69cbdd19 100644
--- a/README.md
+++ b/README.md
@@ -137,9 +137,9 @@ This package comprises the following classes that can be imported in Python and
 The repository further comprises:
 
 - Five examples on how to use **BERT** (in the [`examples` folder](./examples)):
-  - [`extract_features.py`](./examples/extract_features.py) - Show how to extract hidden states from an instance of `BertModel`,
-  - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task,
-  - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 and SQuAD v2.0 tasks.
+  - [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py) - Show how to extract hidden states from an instance of `BertModel`,
+  - [`run_bert_classifier.py`](./examples/run_bert_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task,
+  - [`run_bert_squad.py`](./examples/run_bert_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 and SQuAD v2.0 tasks.
   - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task.
   - [`simple_lm_finetuning.py`](./examples/lm_finetuning/simple_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPretraining` on a target text corpus.
 
@@ -541,7 +541,7 @@ where
     - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
     - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
     - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_bert_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
     - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
     - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
     - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
@@ -720,7 +720,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling.py`](./pytorch_pretrained_bert/modeling.py)
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)), and
+- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
 - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
@@ -735,7 +735,7 @@ This model *outputs* a tuple composed of:
 
 - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
 
-An example on how to use this class is given in the [`extract_features.py`](./examples/extract_features.py) script which can be used to extract the hidden states of the model for a given input.
+An example on how to use this class is given in the [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py) script which can be used to extract the hidden states of the model for a given input.
 
 #### 2. `BertForPreTraining`
 
@@ -792,7 +792,7 @@ An example on how to use this class is given in the [`run_lm_finetuning.py`](./e
 
 The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence (see Figures 3a and 3b in the BERT paper).
 
-An example on how to use this class is given in the [`run_classifier.py`](./examples/run_classifier.py) script which can be used to fine-tune a single sequence (or pair of sequence) classifier using BERT, for example for the MRPC task.
+An example on how to use this class is given in the [`run_bert_classifier.py`](./examples/run_bert_classifier.py) script which can be used to fine-tune a single sequence (or pair of sequence) classifier using BERT, for example for the MRPC task.
 
 #### 6. `BertForMultipleChoice`
 
@@ -816,7 +816,7 @@ The token-level classifier is a linear layer that takes as input the last hidden
 
 The token-level classifier takes as input the full sequence of the last hidden state and compute several (e.g. two) scores for each tokens that can for example respectively be the score that a given token is a `start_span` and a `end_span` token (see Figures 3c and 3d in the BERT paper).
 
-An example on how to use this class is given in the [`run_squad.py`](./examples/run_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task.
+An example on how to use this class is given in the [`run_bert_squad.py`](./examples/run_bert_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task.
 
 #### 9. `OpenAIGPTModel`
 
@@ -1138,7 +1138,7 @@ An overview of the implemented schedules:
 | Sub-section | Description |
 |-|-|
 | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models |
-| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` |
+| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_bert_classifier.py`, `run_bert_squad.py` and `run_lm_finetuning.py` |
 | [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#openai-gpt-transformer-xl-and-gpt-2-running-the-examples) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` |
 | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`|
 
@@ -1146,7 +1146,7 @@ An overview of the implemented schedules:
 
 BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
 
-To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py): gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
+To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py): gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
 
 Here is how to use these techniques in our scripts:
 
@@ -1159,7 +1159,7 @@ To use 16-bits training and distributed training, you need to install NVIDIA's a
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
 ```bash
-python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 ```
 Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
 
@@ -1201,7 +1201,7 @@ and unpack it to some directory `$GLUE_DIR`.
 export GLUE_DIR=/path/to/glue
 export TASK_NAME=MRPC
 
-python run_classifier.py \
+python run_bert_classifier.py \
   --task_name $TASK_NAME \
   --do_train \
   --do_eval \
@@ -1234,7 +1234,7 @@ and unpack it to some directory `$GLUE_DIR`.
 ```shell
 export GLUE_DIR=/path/to/glue
 
-python run_classifier.py \
+python run_bert_classifier.py \
   --task_name MRPC \
   --do_train \
   --do_eval \
@@ -1256,7 +1256,7 @@ Then run
 ```shell
 export GLUE_DIR=/path/to/glue
 
-python run_classifier.py \
+python run_bert_classifier.py \
   --task_name MRPC \
   --do_train \
   --do_eval \
@@ -1275,7 +1275,7 @@ python run_classifier.py \
 Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name MRPC --do_train   --do_eval   --do_lower_case   --data_dir $GLUE_DIR/MRPC/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0  --output_dir /tmp/mrpc_output/
+python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name MRPC --do_train   --do_eval   --do_lower_case   --data_dir $GLUE_DIR/MRPC/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0  --output_dir /tmp/mrpc_output/
 ```
 
 Training with these hyper-parameters gave us the following results:
@@ -1291,7 +1291,7 @@ Training with these hyper-parameters gave us the following results:
 Here is an example on MNLI:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 run_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --do_lower_case   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --do_lower_case   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
 ```
 
 ```bash
@@ -1324,7 +1324,7 @@ The data for SQuAD can be downloaded with the following links and should be save
 ```shell
 export SQUAD_DIR=/path/to/SQUAD
 
-python run_squad.py \
+python run_bert_squad.py \
   --bert_model bert-base-uncased \
   --do_train \
   --do_predict \
@@ -1351,7 +1351,7 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word
 
 ```bash
 python -m torch.distributed.launch --nproc_per_node=8 \
- run_squad.py \
+ run_bert_squad.py \
  --bert_model bert-large-uncased-whole-word-masking  \
  --do_train \
  --do_predict \
@@ -1378,7 +1378,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
 And here is the model provided as `bert-large-cased-whole-word-masking-finetuned-squad`:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node=8  run_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
+python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
 ```
 
 Training with these hyper-parameters gave us the following results:
@@ -1499,7 +1499,7 @@ Here is the full list of hyper-parameters for this run:
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
-python ./run_squad.py \
+python ./run_bert_squad.py \
   --bert_model bert-large-uncased \
   --do_train \
   --do_predict \
@@ -1521,7 +1521,7 @@ Here is an example of hyper-parameters for a FP16 run we tried:
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
-python ./run_squad.py \
+python ./run_bert_squad.py \
   --bert_model bert-large-uncased \
   --do_train \
   --do_predict \
@@ -1547,7 +1547,7 @@ Here is an example with the recent `bert-large-uncased-whole-word-masking`:
 
 ```bash
 python -m torch.distributed.launch --nproc_per_node=8 \
-  run_squad.py \
+  run_bert_squad.py \
   --bert_model bert-large-uncased-whole-word-masking \
   --do_train \
   --do_predict \
@@ -1563,6 +1563,86 @@ python -m torch.distributed.launch --nproc_per_node=8 \
   --gradient_accumulation_steps 2
 ```
 
+## Fine-tuning XLNet
+
+#### STS-B
+
+This example code fine-tunes XLNet on the STS-B corpus.
+
+Before running this example you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```shell
+export GLUE_DIR=/path/to/glue
+
+python run_xlnet_classifier.py \
+ --task_name STS-B \
+ --do_train \
+ --do_eval \
+ --do_lower_case \
+ --data_dir $GLUE_DIR/STS-B/ \
+ --max_seq_length 128 \
+ --train_batch_size 8 \
+ --gradient_accumulation_steps 1 \
+ --learning_rate 5e-5 \
+ --num_train_epochs 3.0 \
+ --output_dir /tmp/mrpc_output/
+```
+
+Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) gave evaluation results between 84% and 88%.
+
+**Distributed training**
+Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node 8 \
+ run_xlnet_classifier.py \
+ --task_name STS-B \
+ --do_train \
+ --do_eval \
+ --data_dir $GLUE_DIR/STS-B/ \
+ --max_seq_length 128 \
+ --train_batch_size 8 \
+ --gradient_accumulation_steps 1 \
+ --learning_rate 5e-5 \
+ --num_train_epochs 3.0 \
+ --output_dir /tmp/mrpc_output/
+```
+
+Training with these hyper-parameters gave us the following results:
+```bash
+  acc = 0.8823529411764706
+  acc_and_f1 = 0.901702786377709
+  eval_loss = 0.3418912578906332
+  f1 = 0.9210526315789473
+  global_step = 174
+  loss = 0.07231863956341798
+```
+
+Here is an example on MNLI:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+```
+
+```bash
+***** Eval results *****
+  acc = 0.8679706601466992
+  eval_loss = 0.4911287787382479
+  global_step = 18408
+  loss = 0.04755385363816904
+
+***** Eval results *****
+  acc = 0.8747965825874695
+  eval_loss = 0.45516540421714036
+  global_step = 18408
+  loss = 0.04755385363816904
+```
+
+This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
+
 ## BERTology
 
 There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
@@ -1599,7 +1679,7 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 
 You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
 
-This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)).
+This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)).
 
 You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
 
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 154aac332a..758c96c67d 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -203,7 +203,9 @@ def main():
                 train_features = pickle.load(reader)
         except:
             train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+                train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
+                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
+                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving train features into cached file %s", cached_train_features_file)
                 with open(cached_train_features_file, "wb") as writer:
@@ -347,7 +349,9 @@ def main():
                 eval_features = pickle.load(reader)
         except:
             eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
+                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
+                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                 with open(cached_eval_features_file, "wb") as writer:
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 4924afaea2..ed3cde5a93 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -388,8 +388,15 @@ class WnliProcessor(DataProcessor):
 
 
 def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer, output_mode):
-    """Loads a data file into a list of `InputBatch`s."""
+                                 tokenizer, output_mode,
+                                 cls_token_at_end=False, cls_token='[CLS]',
+                                 sep_token='[SEP]', cls_token_segment_id=0):
+    """ Loads a data file into a list of `InputBatch`s
+        `cls_token_at_end` define the location of the CLS token:
+            - False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+    """
 
     label_map = {label : i for i, label in enumerate(label_list)}
 
@@ -415,10 +422,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         # The convention in BERT is:
         # (a) For sequence pairs:
         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
         # (b) For single sequences:
         #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
+        #  type_ids:   0   0   0   0  0     0   0
         #
         # Where "type_ids" are used to indicate whether this is the first
         # sequence or the second sequence. The embedding vectors for `type=0` and
@@ -430,13 +437,20 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         # For classification tasks, the first vector (corresponding to [CLS]) is
         # used as as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        tokens = tokens_a + [sep_token]
         segment_ids = [0] * len(tokens)
 
         if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
+            tokens += tokens_b + [sep_token]
             segment_ids += [1] * (len(tokens_b) + 1)
 
+        if cls_token_at_end:
+            tokens = tokens + [cls_token]
+            segment_ids = segment_ids + [cls_token_segment_id]
+        else:
+            tokens = [cls_token] + tokens
+            segment_ids = [cls_token_segment_id] + segment_ids
+
         input_ids = tokenizer.convert_tokens_to_ids(tokens)
 
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
diff --git a/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb b/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
index c91822d8f9..a75e052643 100644
--- a/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
@@ -86,7 +86,7 @@
     "spec.loader.exec_module(module)\n",
     "sys.modules['modeling_tensorflow'] = module\n",
     "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_squad.py')\n",
+    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_bert_squad.py')\n",
     "module = importlib.util.module_from_spec(spec)\n",
     "spec.loader.exec_module(module)\n",
     "sys.modules['run_squad_tensorflow'] = module\n",
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 95bdd7452f..bc2304bc06 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -778,7 +778,7 @@ class BertModel(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -905,7 +905,7 @@ class BertForPreTraining(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -986,7 +986,7 @@ class BertForMaskedLM(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -1064,7 +1064,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -1141,7 +1141,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -1219,7 +1219,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
             with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
             and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
@@ -1300,7 +1300,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
@@ -1384,7 +1384,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see BERT paper for more details).
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 9cdf82bbc3..7687d0d52e 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -46,7 +46,7 @@ XLNET_CONFIG_NAME = 'xlnet_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 
-def build_tf_xlnet_to_pytorch_map(model, config):
+def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
     """ A map of modules from TF to PyTorch.
         I use a map to keep the PyTorch model as
         identical to the original PyTorch model as possible.
@@ -55,8 +55,15 @@ def build_tf_xlnet_to_pytorch_map(model, config):
     tf_to_pt_map = {}
 
     if hasattr(model, 'transformer'):
-        # We are loading pre-trained weights in a XLNetLMHeadModel => we will load also the output bias
-        tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
+        if hasattr(model, 'lm_loss'):
+            # We will load also the output bias
+            tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
+        elif hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights:
+            # We will load also the sequence summary
+            tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
+            tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
+        elif hasattr(model, 'proj_loss') and any('model/regression' in name for name in tf_weights.keys()):
+            raise NotImplementedError
         # Now load the rest of the transformer
         model = model.transformer
 
@@ -116,9 +123,6 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
-    # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config)
-
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
@@ -127,9 +131,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
 
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
+
     for name, pointer in tf_to_pt_map.items():
         print("Importing {}".format(name))
-        assert name in tf_weights
+        if name not in tf_weights:
+            print("{} not in tf pre-trained weights, skipping".format(name))
+            continue
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
@@ -643,6 +652,11 @@ class XLNetPreTrainedModel(nn.Module):
         elif isinstance(module, XLNetLayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
+        elif isinstance(module, XLNetRelativeAttention):
+            for param in [module.q, module.k, module.v, module.o, module.r,
+                          module.r_r_bias, module.r_s_bias, module.r_w_bias,
+                          module.seg_embed]:
+                param.data.normal_(mean=0.0, std=self.config.initializer_range)
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
@@ -904,15 +918,19 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
             mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
@@ -946,6 +964,7 @@ class XLNetModel(XLNetPreTrainedModel):
         # so we move here the first dimension (batch) to the end
         inp_k = inp_k.transpose(0, 1).contiguous()
         token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
+        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
         attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
@@ -969,11 +988,15 @@ class XLNetModel(XLNetPreTrainedModel):
             raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
 
         # data mask: input mask & perm mask
-        if attention_mask is not None and perm_mask is not None:
-            data_mask = attention_mask[None] + perm_mask
-        elif attention_mask is not None and perm_mask is None:
-            data_mask = attention_mask[None]
-        elif attention_mask is None and perm_mask is not None:
+        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
+        "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+        if input_mask is None and attention_mask is not None:
+            input_mask = 1.0 - attention_mask
+        if input_mask is not None and perm_mask is not None:
+            data_mask = input_mask[None] + perm_mask
+        elif input_mask is not None and perm_mask is None:
+            data_mask = input_mask[None]
+        elif input_mask is None and perm_mask is not None:
             data_mask = perm_mask
         else:
             data_mask = None
@@ -1077,8 +1100,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     Inputs:
         inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
         token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
+        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
         mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
             If None, no memory is used.
@@ -1112,14 +1139,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
         n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
@@ -1142,15 +1169,19 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self.lm_loss.weight = self.transformer.word_embedding.weight
 
-    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            attention_mask: float32 Tensor in shape [bsz, len], the input mask.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
@@ -1171,7 +1202,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
+        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
@@ -1242,8 +1273,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     Inputs:
         inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
         token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        attention_mask: float32 Tensor in shape [bsz, len], the input mask.
+        input_mask: float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
+        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
         mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
             If None, no memory is used.
@@ -1278,14 +1313,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
         n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
     model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
@@ -1306,15 +1341,19 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.loss_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 target=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            attention_mask: float32 Tensor in shape [bsz, len], the input mask.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
@@ -1332,7 +1371,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
@@ -1372,11 +1411,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see XLNet paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+        `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
+        `input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
             input sequence length in the current batch. It's the mask that we typically use for attention when
             a batch has varying length sentences.
@@ -1400,14 +1443,14 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
     model = XLNetForQuestionAnswering(config)
-    start_logits, end_logits = model(input_ids, token_type_ids, attention_mask)
+    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):
@@ -1418,11 +1461,11 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, attention_mask=None,
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 start_positions=None, end_positions=None,
                 output_all_encoded_layers=True, head_mask=None):
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, attention_mask,
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q,
                                             output_all_encoded_layers, head_mask)
 
diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index d37165d888..328964c535 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -58,7 +58,6 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
 }
 VOCAB_NAME = 'vocab.txt'
 
-
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
@@ -116,6 +115,46 @@ class BertTokenizer(object):
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
         self.max_len = max_len if max_len is not None else int(1e12)
 
+    @property
+    def UNK_TOKEN(self):
+        return "[UNK]"
+
+    @property
+    def SEP_TOKEN(self):
+        return "[SEP]"
+
+    @property
+    def PAD_TOKEN(self):
+        return "[PAD]"
+
+    @property
+    def CLS_TOKEN(self):
+        return "[CLS]"
+
+    @property
+    def MASK_TOKEN(self):
+        return "[MASK]"
+
+    @property
+    def UNK_ID(self):
+        return self.vocab["[UNK]"]
+
+    @property
+    def SEP_ID(self):
+        return self.vocab["[SEP]"]
+
+    @property
+    def PAD_ID(self):
+        return self.vocab["[PAD]"]
+
+    @property
+    def CLS_ID(self):
+        return self.vocab["[CLS]"]
+
+    @property
+    def MASK_ID(self):
+        return self.vocab["[MASK]"]
+
     def tokenize(self, text):
         split_tokens = []
         if self.do_basic_tokenize:
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index 320800c8ff..63717a6e25 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -38,26 +38,6 @@ SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 SPIECE_UNDERLINE = u'▁'
 
-# Tokens
-special_symbols = {
-    "<unk>"  : 0,
-    "<s>"    : 1,
-    "</s>"   : 2,
-    "<cls>"  : 3,
-    "<sep>"  : 4,
-    "<pad>"  : 5,
-    "<mask>" : 6,
-    "<eod>"  : 7,
-    "<eop>"  : 8,
-}
-
-VOCAB_SIZE = 32000
-UNK_ID = special_symbols["<unk>"]
-CLS_ID = special_symbols["<cls>"]
-SEP_ID = special_symbols["<sep>"]
-MASK_ID = special_symbols["<mask>"]
-EOD_ID = special_symbols["<eod>"]
-
 # Segments (not really needed)
 SEG_ID_A   = 0
 SEG_ID_B   = 1
@@ -70,6 +50,18 @@ class XLNetTokenizer(object):
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
     """
+    # Tokens
+    special_symbols = {
+        "<unk>"  : 0,
+        "<s>"    : 1,
+        "</s>"   : 2,
+        "<cls>"  : 3,
+        "<sep>"  : 4,
+        "<pad>"  : 5,
+        "<mask>" : 6,
+        "<eod>"  : 7,
+        "<eop>"  : 8,
+    }
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
@@ -147,6 +139,46 @@ class XLNetTokenizer(object):
         self.special_tokens_decoder = {}
         self.set_special_tokens(special_tokens)
 
+    @property
+    def UNK_TOKEN(self):
+        return "<unk>"
+
+    @property
+    def SEP_TOKEN(self):
+        return "<sep>"
+
+    @property
+    def PAD_TOKEN(self):
+        return "<pad>"
+
+    @property
+    def CLS_TOKEN(self):
+        return "<cls>"
+
+    @property
+    def MASK_TOKEN(self):
+        return "<mask>"
+
+    @property
+    def UNK_ID(self):
+        return self.special_symbols["<unk>"]
+
+    @property
+    def SEP_ID(self):
+        return self.special_symbols["<sep>"]
+
+    @property
+    def PAD_ID(self):
+        return self.special_symbols["<pad>"]
+
+    @property
+    def CLS_ID(self):
+        return self.special_symbols["<cls>"]
+
+    @property
+    def MASK_ID(self):
+        return self.special_symbols["<mask>"]
+
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
 
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index c99cfe25dd..dbae74aa80 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -86,7 +86,7 @@ class XLNetModelTest(unittest.TestCase):
             inp_q = target_mapping[:, 0, :].clone()  # predict last token
 
             # inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            # seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
+            # token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             # input_mask: float32 Tensor in shape [bsz, len], the input mask.
             #     0 for real tokens and 1 for padding.
             # mems: a list of float32 Tensors in shape [bsz, mem_len, d_model], memory
@@ -138,11 +138,11 @@ class XLNetModelTest(unittest.TestCase):
             model = XLNetLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, seg_id=segment_ids, target=lm_labels)
-            all_logits_1, mems_1b = model(input_ids_1, seg_id=segment_ids)
+            loss_1, mems_1a = model(input_ids_1, token_type_ids=segment_ids, target=lm_labels)
+            all_logits_1, mems_1b = model(input_ids_1, token_type_ids=segment_ids)
 
-            loss_2, mems_2a = model(input_ids_2, seg_id=segment_ids, target=lm_labels, mems=mems_1a)
-            all_logits_2, mems_2b = model(input_ids_2, seg_id=segment_ids, mems=mems_1b)
+            loss_2, mems_2a = model(input_ids_2, token_type_ids=segment_ids, target=lm_labels, mems=mems_1a)
+            all_logits_2, mems_2b = model(input_ids_2, token_type_ids=segment_ids, mems=mems_1b)
 
             logits, _ = model(input_ids_q,
                                     perm_mask=perm_mask,

From c888663f18673003574cffe9608c5aae2bc9ccff Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 24 Jun 2019 14:38:24 +0200
Subject: [PATCH 014/139] overwrite output directories if needed

---
 examples/run_bert_classifier.py  | 2 +-
 examples/run_bert_squad.py       | 2 +-
 examples/run_xlnet_classifier.py | 2 +-
 examples/run_xlnet_squad.py      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index cc8d1fe571..8bb37159d3 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -186,7 +186,7 @@ def main():
         raise ValueError("At least one of `do_train` or `do_eval` must be True.")
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
     if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(args.output_dir)
 
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index c0e7844236..b35a9175ec 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -179,7 +179,7 @@ def main():
                 "If `do_predict` is True, then `predict_file` must be specified.")
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory () already exists and is not empty.")
+        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
     if not os.path.exists(args.output_dir):
         os.makedirs(args.output_dir)
 
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 758c96c67d..6733a25573 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -151,7 +151,7 @@ def main():
         raise ValueError("At least one of `do_train` or `do_eval` must be True.")
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
     if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(args.output_dir)
 
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index b01bf82a55..a72d648ff7 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -179,7 +179,7 @@ def main():
                 "If `do_predict` is True, then `predict_file` must be specified.")
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory () already exists and is not empty.")
+        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
     if not os.path.exists(args.output_dir):
         os.makedirs(args.output_dir)
 

From 7334bf6c21c65b6090e16247e294b737581c6d2e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 24 Jun 2019 15:05:11 +0200
Subject: [PATCH 015/139] pad on left for xlnet

---
 examples/run_xlnet_classifier.py | 18 ++++++++++++------
 examples/utils_glue.py           | 26 +++++++++++++++++---------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 6733a25573..514776b242 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -198,14 +198,17 @@ def main():
             list(filter(None, args.xlnet_model.split('/'))).pop(),
                         str(args.max_seq_length),
                         str(task_name)))
-        try:
+        if os.path.exists(cached_train_features_file):
+            logger.info("Loading train features for cache file %s", cached_train_features_file)
             with open(cached_train_features_file, "rb") as reader:
                 train_features = pickle.load(reader)
-        except:
+        else:
+            logger.info("No cache file at %s, preparing train features", cached_train_features_file)
             train_features = convert_examples_to_features(
                 train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                 cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2)
+                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving train features into cached file %s", cached_train_features_file)
                 with open(cached_train_features_file, "wb") as writer:
@@ -344,14 +347,17 @@ def main():
             list(filter(None, args.xlnet_model.split('/'))).pop(),
                         str(args.max_seq_length),
                         str(task_name)))
-        try:
+        if os.path.exists(cached_eval_features_file):
+            logger.info("Loading eval features for cache file %s", cached_eval_features_file)
             with open(cached_eval_features_file, "rb") as reader:
                 eval_features = pickle.load(reader)
-        except:
+        else:
+            logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
             eval_features = convert_examples_to_features(
                 eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                 cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2)
+                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                 with open(cached_eval_features_file, "wb") as writer:
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index ed3cde5a93..5d3454f439 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -389,8 +389,11 @@ class WnliProcessor(DataProcessor):
 
 def convert_examples_to_features(examples, label_list, max_seq_length,
                                  tokenizer, output_mode,
-                                 cls_token_at_end=False, cls_token='[CLS]',
-                                 sep_token='[SEP]', cls_token_segment_id=0):
+                                 cls_token_at_end=False, pad_on_left=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=1, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
     """ Loads a data file into a list of `InputBatch`s
         `cls_token_at_end` define the location of the CLS token:
             - False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
@@ -438,11 +441,11 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         # used as as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
         tokens = tokens_a + [sep_token]
-        segment_ids = [0] * len(tokens)
+        segment_ids = [sequence_a_segment_id] * len(tokens)
 
         if tokens_b:
             tokens += tokens_b + [sep_token]
-            segment_ids += [1] * (len(tokens_b) + 1)
+            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
 
         if cls_token_at_end:
             tokens = tokens + [cls_token]
@@ -455,13 +458,18 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
 
         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
-        input_mask = [1] * len(input_ids)
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 
         # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        input_mask += padding
-        segment_ids += padding
+        padding_length = max_seq_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
 
         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length

From 7de17404901853e2d4b64a7624082a57fca70ad1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 25 Jun 2019 10:27:58 +0200
Subject: [PATCH 016/139] add ability to restore fine-tuned TF mdoel

---
 .../convert_xlnet_checkpoint_to_pytorch.py    | 30 ++++++++++++++-----
 pytorch_pretrained_bert/modeling_xlnet.py     | 18 ++++++-----
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index 63f296ad83..e56cb538f4 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -24,16 +24,27 @@ import torch
 
 from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetConfig, XLNetRunConfig,
-                                                    XLNetLMHeadModel, load_tf_weights_in_xlnet)
+                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
+                                                    XLNetForSequenceClassification,
+                                                    load_tf_weights_in_xlnet)
 
-def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path):
+GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "sst-2", "sts-b", "qqp", "qnli", "rte", "wnli"]
+
+
+def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
     # Initialise PyTorch model
     config = XLNetConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = XLNetLMHeadModel(config)
+    if finetuning_task is not None and finetuning_task.lower() in GLUE_TASKS:
+        model_class = XLNetLMHeadModel
+    elif finetuning_task is not None and 'squad' in finetuning_task.lower():
+        model_class = XLNetForQuestionAnswering
+    else:
+        model_class = XLNetLMHeadModel
+    print("Building PyTorch model {} from configuration: {}".format(str(model_class), str(config)))
+    model = model_class(config)
 
     # Load weights from tf checkpoint
-    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
+    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path, finetuning_task)
 
     # Save pytorch-model
     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
@@ -59,12 +70,17 @@ if __name__ == "__main__":
                         required = True,
                         help = "The config json file corresponding to the pre-trained XLNet model. \n"
                                "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_folder_path",
+    parser.add_argument("--pytorch_dump_folder_path",finetuning_task
                         default = None,
                         type = str,
                         required = True,
                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
+    parser.add_argument("--finetuning_task",
+                        default = None,
+                        type = str,
+                        help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
     args = parser.parse_args()
     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                      args.xlnet_config_file,
-                                     args.pytorch_dump_folder_path)
+                                     args.pytorch_dump_folder_path,
+                                     args.finetuning_task)
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 7687d0d52e..495f756f7e 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -46,7 +46,7 @@ XLNET_CONFIG_NAME = 'xlnet_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 
-def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
+def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_task=None):
     """ A map of modules from TF to PyTorch.
         I use a map to keep the PyTorch model as
         identical to the original PyTorch model as possible.
@@ -62,14 +62,16 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
             # We will load also the sequence summary
             tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
             tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
-        elif hasattr(model, 'proj_loss') and any('model/regression' in name for name in tf_weights.keys()):
-            raise NotImplementedError
+        elif hasattr(model, 'logits_proj') and finetuning_task is not None and any('model/regression' in name for name in tf_weights.keys()):
+            tf_to_pt_map['model/regression_{}/logit/kernel'.format(finetuning_task)] = model.logits_proj.weight
+            tf_to_pt_map['model/regression_{}/logit/bias'.format(finetuning_task)] = model.logits_proj.bias
+
         # Now load the rest of the transformer
         model = model.transformer
 
     # Embeddings and output
     tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight,
-                    'model/transformer/mask_emb/mask_emb': model.mask_emb})
+                         'model/transformer/mask_emb/mask_emb': model.mask_emb})
 
     # Transformer blocks
     for i, b in enumerate(model.layer):
@@ -113,7 +115,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
         'model/transformer/seg_embed': seg_embed_list})
     return tf_to_pt_map
 
-def load_tf_weights_in_xlnet(model, config, tf_path):
+def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None):
     """ Load tf checkpoints in a pytorch model
     """
     try:
@@ -132,7 +134,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         tf_weights[name] = array
 
     # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
+    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights, finetuning_task)
 
     for name, pointer in tf_to_pt_map.items():
         print("Importing {}".format(name))
@@ -1338,7 +1340,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
                                                      use_proj=use_proj, output_attentions=output_attentions,
                                                      keep_multihead_output=keep_multihead_output)
-        self.loss_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
+        self.logits_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
@@ -1376,7 +1378,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                                             output_all_encoded_layers, head_mask)
 
         output = self.sequence_summary(output)
-        logits = self.loss_proj(output)
+        logits = self.logits_proj(output)
 
         if target is not None:
             if self.is_regression:

From 603c513b35d1daf623e48eb68d54e06502d5e17d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 25 Jun 2019 10:45:07 +0200
Subject: [PATCH 017/139] update main conversion script and readme

---
 README.md                                     |  24 +++-
 pytorch_pretrained_bert/__main__.py           | 113 +++++++++++-------
 .../convert_xlnet_checkpoint_to_pytorch.py    |   8 +-
 3 files changed, 96 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 4f69cbdd19..e7834f8605 100644
--- a/README.md
+++ b/README.md
@@ -1690,7 +1690,7 @@ Here is an example of the conversion process for a pre-trained `BERT-Base Uncase
 ```shell
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
+pytorch_pretrained_bert bert \
   $BERT_BASE_DIR/bert_model.ckpt \
   $BERT_BASE_DIR/bert_config.json \
   $BERT_BASE_DIR/pytorch_model.bin
@@ -1705,7 +1705,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 ```shell
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-pytorch_pretrained_bert convert_openai_checkpoint \
+pytorch_pretrained_bert gpt \
   $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [OPENAI_GPT_CONFIG]
@@ -1718,7 +1718,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 ```shell
 export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-pytorch_pretrained_bert convert_transfo_xl_checkpoint \
+pytorch_pretrained_bert transfo_xl \
   $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [TRANSFO_XL_CONFIG]
@@ -1731,12 +1731,28 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
 ```shell
 export GPT2_DIR=/path/to/gpt2/checkpoint
 
-pytorch_pretrained_bert convert_gpt2_checkpoint \
+pytorch_pretrained_bert gpt2 \
   $GPT2_DIR/model.ckpt \
   $PYTORCH_DUMP_OUTPUT \
   [GPT2_CONFIG]
 ```
 
+### XLNet
+
+Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
+
+```shell
+export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+
+pytorch_pretrained_bert xlnet \
+  $TRANSFO_XL_CHECKPOINT_PATH \
+  $TRANSFO_XL_CONFIG_PATH \
+  $PYTORCH_DUMP_OUTPUT \
+  STS-B \
+```
+
+
 ## TPU
 
 TPU support and pretraining scripts
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index a2aae9e9ce..bb9534a830 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -1,20 +1,16 @@
 # coding: utf8
 def main():
     import sys
-    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
-        "convert_tf_checkpoint_to_pytorch",
-        "convert_openai_checkpoint",
-        "convert_transfo_xl_checkpoint",
-        "convert_gpt2_checkpoint",
-    ]:
+    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet"]:
         print(
         "Should be used as one of: \n"
-        ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
+        ">> `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        ">> `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
+        ">> `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
+        ">> `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
+        ">> `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
     else:
-        if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
+        if sys.argv[1] == "bert":
             try:
                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
@@ -25,24 +21,28 @@ def main():
 
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
             else:
                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                 TF_CONFIG = sys.argv.pop()
                 TF_CHECKPOINT = sys.argv.pop()
                 convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "convert_openai_checkpoint":
+        elif sys.argv[1] == "gpt":
             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
-            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                OPENAI_GPT_CONFIG = sys.argv[4]
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
             else:
-                OPENAI_GPT_CONFIG = ""
-            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
-                                                 OPENAI_GPT_CONFIG,
-                                                 PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "convert_transfo_xl_checkpoint":
+                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    OPENAI_GPT_CONFIG = sys.argv[4]
+                else:
+                    OPENAI_GPT_CONFIG = ""
+                convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
+                                                    OPENAI_GPT_CONFIG,
+                                                    PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "transfo_xl":
             try:
                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
@@ -50,20 +50,23 @@ def main():
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
-
-            if 'ckpt' in sys.argv[2].lower():
-                TF_CHECKPOINT = sys.argv[2]
-                TF_DATASET_FILE = ""
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
-                TF_DATASET_FILE = sys.argv[2]
-                TF_CHECKPOINT = ""
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                TF_CONFIG = sys.argv[4]
-            else:
-                TF_CONFIG = ""
-            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
-        else:
+                if 'ckpt' in sys.argv[2].lower():
+                    TF_CHECKPOINT = sys.argv[2]
+                    TF_DATASET_FILE = ""
+                else:
+                    TF_DATASET_FILE = sys.argv[2]
+                    TF_CHECKPOINT = ""
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
+        elif sys.argv[1] == "gpt2":
             try:
                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
             except ImportError:
@@ -72,12 +75,40 @@ def main():
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
-            TF_CHECKPOINT = sys.argv[2]
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                TF_CONFIG = sys.argv[4]
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
-                TF_CONFIG = ""
-            convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+                TF_CHECKPOINT = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        else:
+            try:
+                from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+            except ImportError:
+                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) < 5 or len(sys.argv) > 6:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+            else:
+                TF_CHECKPOINT = sys.argv[2]
+                TF_CONFIG = sys.argv[3]
+                PYTORCH_DUMP_OUTPUT = sys.argv[4]
+                if len(sys.argv) == 6:
+                    FINETUNING_TASK = sys.argv[5]
+
+                convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
+                                                    TF_CONFIG,
+                                                    PYTORCH_DUMP_OUTPUT,
+                                                    FINETUNING_TASK)
+
 if __name__ == '__main__':
     main()
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index e56cb538f4..d343fd2189 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
                         required = True,
                         help = "The config json file corresponding to the pre-trained XLNet model. \n"
                                "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_folder_path",finetuning_task
+    parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
@@ -81,6 +81,6 @@ if __name__ == "__main__":
                         help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
     args = parser.parse_args()
     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.xlnet_config_file,
-                                     args.pytorch_dump_folder_path,
-                                     args.finetuning_task)
+                                        args.xlnet_config_file,
+                                        args.pytorch_dump_folder_path,
+                                        args.finetuning_task)

From e55d4c4edea6fe98f42109239a27f93bb88ed44e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 00:57:53 +0200
Subject: [PATCH 018/139] various updates to conversion, models and examples

---
 README.md                                     |  3 +-
 examples/{run_swag.py => run_bert_swag.py}    |  0
 examples/run_xlnet_classifier.py              | 22 +++++++++-----
 pytorch_pretrained_bert/__init__.py           |  1 +
 .../convert_xlnet_checkpoint_to_pytorch.py    | 29 ++++++++++++++-----
 pytorch_pretrained_bert/modeling_xlnet.py     | 10 ++++---
 6 files changed, 44 insertions(+), 21 deletions(-)
 rename examples/{run_swag.py => run_bert_swag.py} (100%)

diff --git a/README.md b/README.md
index e7834f8605..d8309be01e 100644
--- a/README.md
+++ b/README.md
@@ -1394,7 +1394,7 @@ The data for SWAG can be downloaded by cloning the following [repository](https:
 ```shell
 export SWAG_DIR=/path/to/SWAG
 
-python run_swag.py \
+python run_bert_swag.py \
   --bert_model bert-base-uncased \
   --do_train \
   --do_lower_case \
@@ -1581,7 +1581,6 @@ python run_xlnet_classifier.py \
  --task_name STS-B \
  --do_train \
  --do_eval \
- --do_lower_case \
  --data_dir $GLUE_DIR/STS-B/ \
  --max_seq_length 128 \
  --train_batch_size 8 \
diff --git a/examples/run_swag.py b/examples/run_bert_swag.py
similarity index 100%
rename from examples/run_swag.py
rename to examples/run_bert_swag.py
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 514776b242..0278b40cdd 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -70,6 +70,8 @@ def main():
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training to perform linear learning rate warmup for. "
                              "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--clip_gradients", default=1.0, type=float,
+                        help="Clip gradient norms.")
     parser.add_argument("--train_batch_size", default=32, type=int,
                         help="Total batch size for training.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
@@ -80,6 +82,8 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                              "0 (default value): dynamic loss scaling.\n"
                              "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument("--log_every", default=10, type=int,
+                        help="Log metrics every X training steps.")
     # evaluation
     parser.add_argument("--do_eval", action='store_true',
                         help="Whether to run eval on the dev set.")
@@ -234,12 +238,13 @@ def main():
 
         # Prepare optimizer
 
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
+        optimizer_grouped_parameters = model.parameters()
+        # param_optimizer = list(model.named_parameters())
+        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        # optimizer_grouped_parameters = [
+        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        #     ]
         if args.fp16:
             try:
                 from apex.optimizers import FP16_Optimizer
@@ -297,6 +302,9 @@ def main():
                 else:
                     loss.backward()
 
+                if args.clip_gradients > 0.0:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)
+
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
@@ -310,7 +318,7 @@ def main():
                     optimizer.step()
                     optimizer.zero_grad()
                     global_step += 1
-                    if args.local_rank in [-1, 0]:
+                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (step + 1) % args.log_every == 0):
                         tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                         tb_writer.add_scalar('loss', loss.item(), global_step)
 
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 89639820b8..069c6c52e2 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -20,6 +20,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             load_tf_weights_in_gpt2)
 from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+                             XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet)
 
 from .optimization import BertAdam
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index d343fd2189..d46cc99e73 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -28,20 +28,31 @@ from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetForSequenceClassification,
                                                     load_tf_weights_in_xlnet)
 
-GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "sst-2", "sts-b", "qqp", "qnli", "rte", "wnli"]
+GLUE_TASKS = {
+    "cola": "classification",
+    "mnli": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification",
+}
 
 
 def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
     # Initialise PyTorch model
     config = XLNetConfig.from_json_file(bert_config_file)
-    if finetuning_task is not None and finetuning_task.lower() in GLUE_TASKS:
-        model_class = XLNetLMHeadModel
-    elif finetuning_task is not None and 'squad' in finetuning_task.lower():
-        model_class = XLNetForQuestionAnswering
+
+    finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
+    if finetuning_task in GLUE_TASKS:
+        print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
+        model = XLNetForSequenceClassification(config, is_regression=bool(GLUE_TASKS[finetuning_task] == "regression"))
+    elif 'squad' in finetuning_task:
+        model = XLNetForQuestionAnswering(config)
     else:
-        model_class = XLNetLMHeadModel
-    print("Building PyTorch model {} from configuration: {}".format(str(model_class), str(config)))
-    model = model_class(config)
+        model = XLNetLMHeadModel(config)
 
     # Load weights from tf checkpoint
     load_tf_weights_in_xlnet(model, config, tf_checkpoint_path, finetuning_task)
@@ -80,6 +91,8 @@ if __name__ == "__main__":
                         type = str,
                         help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
     args = parser.parse_args()
+    print(args)
+
     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                         args.xlnet_config_file,
                                         args.pytorch_dump_folder_path,
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 495f756f7e..7ee7be9025 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -30,7 +30,7 @@ from io import open
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
 
@@ -58,11 +58,11 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_tas
         if hasattr(model, 'lm_loss'):
             # We will load also the output bias
             tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
-        elif hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights:
+        if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights:
             # We will load also the sequence summary
             tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
             tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
-        elif hasattr(model, 'logits_proj') and finetuning_task is not None and any('model/regression' in name for name in tf_weights.keys()):
+        if hasattr(model, 'logits_proj') and finetuning_task is not None and 'model/regression_{}/logit/kernel'.format(finetuning_task) in tf_weights:
             tf_to_pt_map['model/regression_{}/logit/kernel'.format(finetuning_task)] = model.logits_proj.weight
             tf_to_pt_map['model/regression_{}/logit/bias'.format(finetuning_task)] = model.logits_proj.bias
 
@@ -133,6 +133,8 @@ def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None):
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
 
+    input("Press Enter to continue...")
+
     # Build TF to PyTorch weights loading map
     tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights, finetuning_task)
 
@@ -144,7 +146,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None):
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if 'kernel' in name and 'ff' in name:
+        if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
             print("Transposing")
             array = np.transpose(array)
         if isinstance(pointer, list):

From 092dacfd623b75530c39773930783783f58fbdbe Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 09:54:05 +0200
Subject: [PATCH 019/139] changing is_regression to unified API

---
 examples/utils_glue.py                        | 12 +++++++
 .../convert_xlnet_checkpoint_to_pytorch.py    | 24 +++++++-------
 pytorch_pretrained_bert/modeling.py           | 11 +++++--
 pytorch_pretrained_bert/modeling_xlnet.py     | 33 ++++++++++---------
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 5d3454f439..e3e4179fae 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -591,3 +591,15 @@ output_modes = {
     "rte": "classification",
     "wnli": "classification",
 }
+
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index d46cc99e73..258b82e363 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -28,16 +28,16 @@ from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetForSequenceClassification,
                                                     load_tf_weights_in_xlnet)
 
-GLUE_TASKS = {
-    "cola": "classification",
-    "mnli": "classification",
-    "mrpc": "classification",
-    "sst-2": "classification",
-    "sts-b": "regression",
-    "qqp": "classification",
-    "qnli": "classification",
-    "rte": "classification",
-    "wnli": "classification",
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
 }
 
 
@@ -46,9 +46,9 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py
     config = XLNetConfig.from_json_file(bert_config_file)
 
     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
-    if finetuning_task in GLUE_TASKS:
+    if finetuning_task in GLUE_TASKS_NUM_LABELS:
         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
-        model = XLNetForSequenceClassification(config, is_regression=bool(GLUE_TASKS[finetuning_task] == "regression"))
+        model = XLNetForSequenceClassification(config, num_labels=GLUE_TASKS_NUM_LABELS[finetuning_task])
     elif 'squad' in finetuning_task:
         model = XLNetForQuestionAnswering(config)
     else:
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index bc2304bc06..ce55c50c68 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -27,7 +27,7 @@ from io import open
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
 
@@ -1196,8 +1196,13 @@ class BertForSequenceClassification(BertPreTrainedModel):
         logits = self.classifier(pooled_output)
 
         if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             return loss
         elif self.output_attentions:
             return all_attentions, logits
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 7ee7be9025..8963f53615 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -1175,7 +1175,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                target=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1212,11 +1212,11 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         logits = self.lm_loss(output)
 
-        if target is not None:
+        if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            target.view(-1))
+                            labels.view(-1))
             return loss, new_mems
 
         # if self.output_attentions:
@@ -1305,13 +1305,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Outputs: Tuple of (logits or loss, mems)
         `logits or loss`:
-            if target is None:
+            if labels is None:
                 Token logits with shape [batch_size, sequence_length] 
             else:
                 CrossEntropy loss with the targets
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Example usage:
     ```python
@@ -1328,13 +1328,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 is_regression=False, output_attentions=False, keep_multihead_output=False):
+                 output_attentions=False, keep_multihead_output=False):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.attn_type = config.attn_type
         self.same_length = config.same_length
         self.summary_type = summary_type
-        self.is_regression = is_regression
+        self.num_labels = num_labels
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
                                               keep_multihead_output=keep_multihead_output)
@@ -1342,12 +1342,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
                                                      use_proj=use_proj, output_attentions=output_attentions,
                                                      keep_multihead_output=keep_multihead_output)
-        self.logits_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
+        self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                target=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1376,19 +1376,20 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Set to None during finetuning.
         """
         output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+                                               mems, perm_mask, target_mapping, inp_q,
+                                               output_all_encoded_layers, head_mask)
 
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
 
-        if target is not None:
-            if self.is_regression:
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
                 loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), target.view(-1))
+                loss = loss_fct(logits.view(-1), labels.view(-1))
             else:
-                loss_fct = CrossEntropyLoss(ignore_index=-1)
-                loss = loss_fct(logits.view(-1, logits.size(-1)), target.view(-1))
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             return loss, new_mems
 
         # if self.output_attentions:

From 93e9971c54e060e528adfdb0ebe149f2b284d660 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 10:02:45 +0200
Subject: [PATCH 020/139] fix tests

---
 README.md                                     |   6 +-
 .../modeling_transfo_xl.py                    |  28 ++---
 .../modeling_transfo_xl_utilities.py          |  34 +++---
 .../modeling_xlnet_utilities.py               | 111 ------------------
 tests/modeling_transfo_xl_test.py             |   4 +-
 tests/modeling_xlnet_test.py                  |   4 +-
 6 files changed, 38 insertions(+), 149 deletions(-)
 delete mode 100644 pytorch_pretrained_bert/modeling_xlnet_utilities.py

diff --git a/README.md b/README.md
index d8309be01e..a5234bd9ba 100644
--- a/README.md
+++ b/README.md
@@ -930,12 +930,12 @@ all_hidden_states = lower_hidden_states + [hidden_states]
 `TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
 
 *Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
-- `target`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the target token indices selected in the range [0, self.config.n_token[
+- `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the labels token indices selected in the range [0, self.config.n_token[
 
 *Outputs* a tuple of (last_hidden_state, new_mems)
 - `softmax_output`: output of the (adaptive) softmax:
-  - if target is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] 
-  - else: Negative log likelihood of target tokens with shape [batch_size, sequence_length]
+  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] 
+  - else: Negative log likelihood of labels tokens with shape [batch_size, sequence_length]
 - `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 #### 14. `GPT2Model`
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index ee04eda496..c1a337a1d7 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -1025,14 +1025,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         `mems`: optional memomry of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `last_hidden_state`: the encoded-hidden-states at the top of the model
             as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Example usage:
     ```python
@@ -1265,7 +1265,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 mems :: optional mems from previous forwar passes (or init_mems)
                     list (num layers) of mem states at the entry of each layer
                         shape :: [self.config.mem_len, bsz, self.config.d_model]
-                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
             Returns:
                 tuple (last_hidden, new_mems) where:
                     new_mems: list (num layers) of mem states at the entry of each layer
@@ -1303,23 +1303,23 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the token indices selected in the range [0, self.config.n_token[
-        `target`: an optional torch.LongTensor of shape [batch_size, sequence_length]
-            with the target token indices selected in the range [0, self.config.n_token[
+        `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length]
+            with the labels token indices selected in the range [0, self.config.n_token[
         `mems`: an optional memory of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `softmax_output`: output of the (adaptive) softmax:
-            if target is None:
+            if labels is None:
                 Negative log likelihood of shape [batch_size, sequence_length] 
             else:
                 log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Example usage:
     ```python
@@ -1375,16 +1375,16 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     def init_mems(self, data):
         return self.transformer.init_mems(data)
 
-    def forward(self, input_ids, target=None, mems=None):
+    def forward(self, input_ids, labels=None, mems=None):
         """ Params:
                 input_ids :: [bsz, len]
-                target :: [bsz, len]
+                labels :: [bsz, len]
             Returns:
                 tuple(softmax_output, new_mems) where:
                     new_mems: list (num layers) of hidden states at the entry of each layer
                         shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids
                     softmax_output: output of the (adaptive) softmax:
-                        if target is None:
+                        if labels is None:
                             Negative log likelihood of shape :: [bsz, len] 
                         else:
                             log probabilities of tokens, shape :: [bsz, len, n_tokens]
@@ -1397,11 +1397,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         pred_hid = last_hidden[:, -tgt_len:]
         if self.sample_softmax > 0 and self.training:
             assert self.config.tie_weight
-            logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
+            logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, labels, pred_hid, self.sampler)
             softmax_output = -F.log_softmax(logit, -1)[:, :, 0]
         else:
-            softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)
-            if target is None:
+            softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels)
+            if labels is None:
                 softmax_output = softmax_output.view(bsz, tgt_len, -1)
             else:
                 softmax_output = softmax_output.view(bsz, tgt_len)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
index 7fd67adb35..2621a57517 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -89,13 +89,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
         return logit
 
-    def forward(self, hidden, target=None, keep_order=False):
+    def forward(self, hidden, labels=None, keep_order=False):
         '''
             Params:
                 hidden :: [len*bsz x d_proj]
-                target :: [len*bsz]
+                labels :: [len*bsz]
             Return:
-                if target is None:
+                if labels is None:
                     out :: [len*bsz] Negative log likelihood
                 else:
                     out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
@@ -104,18 +104,18 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
         '''
 
-        if target is not None:
-            target = target.view(-1)
-            if hidden.size(0) != target.size(0):
-                raise RuntimeError('Input and target should have the same size '
+        if labels is not None:
+            labels = labels.view(-1)
+            if hidden.size(0) != labels.size(0):
+                raise RuntimeError('Input and labels should have the same size '
                                 'in the batch dimension.')
 
         if self.n_clusters == 0:
             logit = self._compute_logit(hidden, self.out_layers[0].weight,
                                         self.out_layers[0].bias, self.out_projs[0])
-            if target is not None:
+            if labels is not None:
                 out = -F.log_softmax(logit, dim=-1) \
-                        .gather(1, target.unsqueeze(1)).squeeze(1)
+                        .gather(1, labels.unsqueeze(1)).squeeze(1)
             else:
                 out = F.log_softmax(logit, dim=-1)
         else:
@@ -144,31 +144,31 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
             head_logprob = F.log_softmax(head_logit, dim=1)
 
-            if target is None:
+            if labels is None:
                 out = hidden.new_empty((head_logit.size(0), self.n_token))
             else:
-                out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
+                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
 
             offset = 0
             cutoff_values = [0] + self.cutoffs
             for i in range(len(cutoff_values) - 1):
                 l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
 
-                if target is not None:
-                    mask_i = (target >= l_idx) & (target < r_idx)
+                if labels is not None:
+                    mask_i = (labels >= l_idx) & (labels < r_idx)
                     indices_i = mask_i.nonzero().squeeze()
 
                     if indices_i.numel() == 0:
                         continue
 
-                    target_i = target.index_select(0, indices_i) - l_idx
+                    target_i = labels.index_select(0, indices_i) - l_idx
                     head_logprob_i = head_logprob.index_select(0, indices_i)
                     hidden_i = hidden.index_select(0, indices_i)
                 else:
                     hidden_i = hidden
 
                 if i == 0:
-                    if target is not None:
+                    if labels is not None:
                         logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                     else:
                         out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
@@ -178,14 +178,14 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                     tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                     cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
-                    if target is not None:
+                    if labels is not None:
                         logprob_i = head_logprob_i[:, cluster_prob_idx] \
                                 + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                     else:
                         logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
                         out[:, l_idx:r_idx] = logprob_i
 
-                if target is not None:
+                if labels is not None:
                     if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
                         out.index_copy_(0, indices_i, -logprob_i)
                     else:
diff --git a/pytorch_pretrained_bert/modeling_xlnet_utilities.py b/pytorch_pretrained_bert/modeling_xlnet_utilities.py
deleted file mode 100644
index e2611b7a41..0000000000
--- a/pytorch_pretrained_bert/modeling_xlnet_utilities.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Utilities for PyTorch XLNet model.
-"""
-
-from collections import defaultdict
-
-import numpy as np
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-special_symbols = {
-    "<unk>"  : 0,
-    "<s>"    : 1,
-    "</s>"   : 2,
-    "<cls>"  : 3,
-    "<sep>"  : 4,
-    "<pad>"  : 5,
-    "<mask>" : 6,
-    "<eod>"  : 7,
-    "<eop>"  : 8,
-}
-
-VOCAB_SIZE = 32000
-UNK_ID = special_symbols["<unk>"]
-CLS_ID = special_symbols["<cls>"]
-SEP_ID = special_symbols["<sep>"]
-MASK_ID = special_symbols["<mask>"]
-EOD_ID = special_symbols["<eod>"]
-
-
-def permutation_mask(inputs, targets, is_masked, perm_size, seq_len):
-    """
-    Sample a permutation of the factorization order, and create an
-    attention mask accordingly.
-    Args:
-        inputs: int64 Tensor in shape [seq_len], input ids.
-        targets: int64 Tensor in shape [seq_len], target ids.
-        is_masked: bool Tensor in shape [seq_len]. True means being selected
-            for partial prediction.
-        perm_size: the length of longest permutation. Could be set to be reuse_len.
-            Should not be larger than reuse_len or there will be data leaks.
-        seq_len: int, sequence length.
-    """
-
-    # Generate permutation indices
-    index = np.arange(10)
-    index = np.transpose(np.reshape(index, [-1, perm_size]))
-    index = np.random.shuffle(index)
-    index = np.reshape(np.transpose(index), [-1])
-
-    # `perm_mask` and `target_mask`
-    # non-functional tokens
-    non_func_tokens = tf.logical_not(tf.logical_or(
-        tf.equal(inputs, SEP_ID),
-        tf.equal(inputs, CLS_ID)))
-
-    non_mask_tokens = tf.logical_and(tf.logical_not(is_masked), non_func_tokens)
-    masked_or_func_tokens = tf.logical_not(non_mask_tokens)
-
-    # Set the permutation indices of non-masked (& non-funcional) tokens to the
-    # smallest index (-1):
-    # (1) they can be seen by all other positions
-    # (2) they cannot see masked positions, so there won"t be information leak
-    smallest_index = -tf.ones([seq_len], dtype=tf.int64)
-    rev_index = tf.where(non_mask_tokens, smallest_index, index)
-
-    # Create `target_mask`: non-funcional and maksed tokens
-    # 1: use mask as input and have loss
-    # 0: use token (or [SEP], [CLS]) as input and do not have loss
-    target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens)
-    target_mask = tf.cast(target_tokens, tf.float32)
-
-    # Create `perm_mask`
-    # `target_tokens` cannot see themselves
-    self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1)
-
-    # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens)
-    # 0: can attend if i > j or j is non-masked
-    perm_mask = tf.logical_and(
-        self_rev_index[:, None] <= rev_index[None, :],
-        masked_or_func_tokens)
-    perm_mask = tf.cast(perm_mask, tf.float32)
-
-    # new target: [next token] for LM and [curr token] (self) for PLM
-    new_targets = tf.concat([inputs[0: 1], targets[: -1]],
-                            axis=0)
-
-    # construct inputs_k
-    inputs_k = inputs
-
-    # construct inputs_q
-    inputs_q = target_mask
-
-    return perm_mask, new_targets, target_mask, inputs_k, inputs_q
-
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
index e5c5f3d163..88a6ad35fe 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/tests/modeling_transfo_xl_test.py
@@ -129,10 +129,10 @@ class TransfoXLModelTest(unittest.TestCase):
             model = TransfoXLLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, target=lm_labels)
+            loss_1, mems_1a = model(input_ids_1, labels=lm_labels)
             lm_logits_1, mems_1b = model(input_ids_1)
 
-            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
+            loss_2, mems_2a = model(input_ids_2, labels=lm_labels, mems=mems_1a)
             lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
 
             outputs = {
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index dbae74aa80..3d14af5d7c 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -138,10 +138,10 @@ class XLNetModelTest(unittest.TestCase):
             model = XLNetLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, token_type_ids=segment_ids, target=lm_labels)
+            loss_1, mems_1a = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
             all_logits_1, mems_1b = model(input_ids_1, token_type_ids=segment_ids)
 
-            loss_2, mems_2a = model(input_ids_2, token_type_ids=segment_ids, target=lm_labels, mems=mems_1a)
+            loss_2, mems_2a = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1a)
             all_logits_2, mems_2b = model(input_ids_2, token_type_ids=segment_ids, mems=mems_1b)
 
             logits, _ = model(input_ids_q,

From 7e3070ae4f0a1f57a974de9b2b54fd7ad3f08fce Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 11:12:00 +0200
Subject: [PATCH 021/139] add from_pretrained method to all configuration
 classes

---
 pytorch_pretrained_bert/__init__.py           |    7 +-
 .../convert_xlnet_checkpoint_to_pytorch.py    |    2 +-
 pytorch_pretrained_bert/file_utils.py         |    3 -
 pytorch_pretrained_bert/model_utils.py        |  198 ++
 pytorch_pretrained_bert/modeling.py           |   62 +-
 pytorch_pretrained_bert/modeling_gpt2.py      |   78 +-
 pytorch_pretrained_bert/modeling_openai.py    |   70 +-
 .../modeling_transfo_xl.py                    |   39 +-
 pytorch_pretrained_bert/modeling_xlm.py       | 1748 +++++++++++++++++
 pytorch_pretrained_bert/modeling_xlnet.py     |   97 +-
 tests/modeling_xlnet_test.py                  |    8 +-
 11 files changed, 1983 insertions(+), 329 deletions(-)
 create mode 100644 pytorch_pretrained_bert/model_utils.py
 create mode 100644 pytorch_pretrained_bert/modeling_xlm.py

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 069c6c52e2..12e251522c 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -18,7 +18,7 @@ from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHe
 from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,
                             load_tf_weights_in_gpt2)
-from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig,
+from .modeling_xlnet import (XLNetConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet)
@@ -26,5 +26,6 @@ from .modeling_xlnet import (XLNetBaseConfig, XLNetConfig, XLNetRunConfig,
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
-from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path,
-                         WEIGHTS_NAME, CONFIG_NAME)
+from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
+
+from .model_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig)
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index 258b82e363..ce4fcc7810 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -23,7 +23,7 @@ import argparse
 import torch
 
 from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
-                                                    XLNetConfig, XLNetRunConfig,
+                                                    XLNetConfig,
                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                     XLNetForSequenceClassification,
                                                     load_tf_weights_in_xlnet)
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 605c841235..ed509e5033 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -44,9 +44,6 @@ except (AttributeError, ImportError):
     PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               default_cache_path)
 
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "pytorch_model.bin"
-
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
new file mode 100644
index 0000000000..af35e7bcc4
--- /dev/null
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import os
+import json
+import copy
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
+
+class PretrainedConfig(object):
+    """ An abstract class to handle dowloading a model pretrained config.
+    """
+    pretrained_config_archive_map = {}
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """
+        Instantiate a PretrainedConfig from a pre-trained model configuration.
+
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `xlnet-large-cased`
+                - a path or url to a pretrained model archive containing:
+                    . `config.json` a configuration file for the model
+            cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
+        """
+        cache_dir = kwargs.get('cache_dir', None)
+        kwargs.pop('cache_dir', None)
+
+        if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
+        else:
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_config_archive_map.keys()),
+                        config_file))
+            return None
+        if resolved_config_file == config_file:
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = cls.from_json_file(resolved_config_file)
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config {}".format(config))
+        return config
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `Config` from a Python dictionary of parameters."""
+        config = cls(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `BertConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
+
+def prune_linear_layer(layer, index, dim=0):
+    """ Prune a linear layer (a model parameters) to keep only entries in index.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    index = index.to(layer.weight.device)
+    W = layer.weight.index_select(dim, index).clone().detach()
+    if layer.bias is not None:
+        if dim == 1:
+            b = layer.bias.clone().detach()
+        else:
+            b = layer.bias[index].clone().detach()
+    new_size = list(layer.weight.size())
+    new_size[dim] = len(index)
+    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
+    new_layer.weight.requires_grad = False
+    new_layer.weight.copy_(W.contiguous())
+    new_layer.weight.requires_grad = True
+    if layer.bias is not None:
+        new_layer.bias.requires_grad = False
+        new_layer.bias.copy_(b.contiguous())
+        new_layer.bias.requires_grad = True
+    return new_layer
+
+
+class Conv1D(nn.Module):
+    """ Conv1D layer as defined by Alec Radford for GPT (and also used in GPT-2)
+        Basically works like a Linear layer but the weights are transposed
+    """
+    def __init__(self, nf, nx):
+        super(Conv1D, self).__init__()
+        self.nf = nf
+        w = torch.empty(nx, nf)
+        nn.init.normal_(w, std=0.02)
+        self.weight = nn.Parameter(w)
+        self.bias = nn.Parameter(torch.zeros(nf))
+
+    def forward(self, x):
+        size_out = x.size()[:-1] + (self.nf,)
+        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
+        x = x.view(*size_out)
+        return x
+
+
+def prune_conv1d_layer(layer, index, dim=1):
+    """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
+        A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    index = index.to(layer.weight.device)
+    W = layer.weight.index_select(dim, index).clone().detach()
+    if dim == 0:
+        b = layer.bias.clone().detach()
+    else:
+        b = layer.bias[index].clone().detach()
+    new_size = list(layer.weight.size())
+    new_size[dim] = len(index)
+    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
+    new_layer.weight.requires_grad = False
+    new_layer.weight.copy_(W.contiguous())
+    new_layer.weight.requires_grad = True
+    new_layer.bias.requires_grad = False
+    new_layer.bias.copy_(b.contiguous())
+    new_layer.bias.requires_grad = True
+    return new_layer
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index ce55c50c68..13ad591f72 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -29,7 +29,8 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
+from .file_utils import cached_path
+from .model_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, prune_linear_layer
 
 logger = logging.getLogger(__name__)
 
@@ -66,30 +67,6 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
 BERT_CONFIG_NAME = 'bert_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
-def prune_linear_layer(layer, index, dim=0):
-    """ Prune a linear layer (a model parameters) to keep only entries in index.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    index = index.to(layer.weight.device)
-    W = layer.weight.index_select(dim, index).clone().detach()
-    if layer.bias is not None:
-        if dim == 1:
-            b = layer.bias.clone().detach()
-        else:
-            b = layer.bias[index].clone().detach()
-    new_size = list(layer.weight.size())
-    new_size[dim] = len(index)
-    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
-    new_layer.weight.requires_grad = False
-    new_layer.weight.copy_(W.contiguous())
-    new_layer.weight.requires_grad = True
-    if layer.bias is not None:
-        new_layer.bias.requires_grad = False
-        new_layer.bias.copy_(b.contiguous())
-        new_layer.bias.requires_grad = True
-    return new_layer
-
 
 def load_tf_weights_in_bert(model, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
@@ -174,9 +151,11 @@ def swish(x):
 ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
 
-class BertConfig(object):
+class BertConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `BertModel`.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
     def __init__(self,
                  vocab_size_or_config_json_file,
                  hidden_size=768,
@@ -238,37 +217,6 @@ class BertConfig(object):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `BertConfig` from a Python dictionary of parameters."""
-        config = BertConfig(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index c4c4876833..88ff521196 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -31,7 +31,8 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
+from .file_utils import cached_path
+from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, prune_conv1d_layer
 from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -41,30 +42,6 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.hugging
 PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                  "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
 
-def prune_conv1d_layer(layer, index, dim=1):
-    """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
-        A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    index = index.to(layer.weight.device)
-    W = layer.weight.index_select(dim, index).clone().detach()
-    if dim == 0:
-        b = layer.bias.clone().detach()
-    else:
-        b = layer.bias[index].clone().detach()
-    new_size = list(layer.weight.size())
-    new_size[dim] = len(index)
-    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
-    new_layer.weight.requires_grad = False
-    new_layer.weight.copy_(W.contiguous())
-    new_layer.weight.requires_grad = True
-    new_layer.bias.requires_grad = False
-    new_layer.bias.copy_(b.contiguous())
-    new_layer.bias.requires_grad = True
-    return new_layer
-
-
 def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
@@ -123,9 +100,10 @@ def gelu(x):
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
 
-class GPT2Config(object):
+class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -194,54 +172,6 @@ class GPT2Config(object):
     def total_tokens_embeddings(self):
         return self.vocab_size + self.n_special
 
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `GPT2Config` from a Python dictionary of parameters."""
-        config = GPT2Config(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `GPT2Config` from a json file of parameters."""
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
-
-class Conv1D(nn.Module):
-    def __init__(self, nf, nx):
-        super(Conv1D, self).__init__()
-        self.nf = nf
-        w = torch.empty(nx, nf)
-        nn.init.normal_(w, std=0.02)
-        self.weight = Parameter(w)
-        self.bias = Parameter(torch.zeros(nf))
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(*size_out)
-        return x
-
 
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index b4df679fe6..464bce26c0 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -31,9 +31,9 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
+from .file_utils import cached_path
+from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, prune_conv1d_layer
 from .modeling import BertLayerNorm as LayerNorm
-from .modeling_gpt2 import prune_conv1d_layer
 
 logger = logging.getLogger(__name__)
 
@@ -122,9 +122,10 @@ def swish(x):
 ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 
 
-class OpenAIGPTConfig(object):
+class OpenAIGPTConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `OpenAIGPTModel`.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -197,61 +198,6 @@ class OpenAIGPTConfig(object):
     def total_tokens_embeddings(self):
         return self.vocab_size + self.n_special
 
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
-        config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `OpenAIGPTConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
-
-class Conv1D(nn.Module):
-    def __init__(self, nf, rf, nx):
-        super(Conv1D, self).__init__()
-        self.rf = rf
-        self.nf = nf
-        if rf == 1:  # faster 1x1 conv
-            w = torch.empty(nx, nf)
-            nn.init.normal_(w, std=0.02)
-            self.weight = Parameter(w)
-            self.bias = Parameter(torch.zeros(nf))
-        else:  # was used to train LM
-            raise NotImplementedError
-
-    def forward(self, x):
-        if self.rf == 1:
-            size_out = x.size()[:-1] + (self.nf,)
-            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-            x = x.view(*size_out)
-        else:
-            raise NotImplementedError
-        return x
-
 
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
@@ -268,8 +214,8 @@ class Attention(nn.Module):
         self.keep_multihead_output = keep_multihead_output
         self.multihead_output = None
 
-        self.c_attn = Conv1D(n_state * 3, 1, nx)
-        self.c_proj = Conv1D(n_state, 1, nx)
+        self.c_attn = Conv1D(n_state * 3, nx)
+        self.c_proj = Conv1D(n_state, nx)
         self.attn_dropout = nn.Dropout(config.attn_pdrop)
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
 
@@ -348,8 +294,8 @@ class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
         super(MLP, self).__init__()
         nx = config.n_embd
-        self.c_fc = Conv1D(n_state, 1, nx)
-        self.c_proj = Conv1D(nx, 1, n_state)
+        self.c_fc = Conv1D(n_state, nx)
+        self.c_proj = Conv1D(nx, n_state)
         self.act = ACT_FNS[config.afn]
         self.dropout = nn.Dropout(config.resid_pdrop)
 
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index c1a337a1d7..65e787c792 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -37,7 +37,8 @@ from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
+from .file_utils import cached_path
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
 
 logger = logging.getLogger(__name__)
 
@@ -178,9 +179,11 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
     return model
 
 
-class TransfoXLConfig(object):
+class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
     def __init__(self,
                  vocab_size_or_config_json_file=267735,
                  cutoffs=[20000, 40000, 200000],
@@ -285,38 +288,6 @@ class TransfoXLConfig(object):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `TransfoXLConfig` from a Python dictionary of parameters."""
-        config = TransfoXLConfig(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `TransfoXLConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
 
 class PositionalEmbedding(nn.Module):
     def __init__(self, demb):
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
new file mode 100644
index 0000000000..aa23618199
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -0,0 +1,1748 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XLM model.
+"""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import copy
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import math
+import itertools
+import numpy as np
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .file_utils import cached_path
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
+}
+PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+}
+
+DECODER_ONLY_PARAMS = [
+    'layer_norm15.%i.weight', 'layer_norm15.%i.bias',
+    'encoder_attn.%i.q_lin.weight', 'encoder_attn.%i.q_lin.bias',
+    'encoder_attn.%i.k_lin.weight', 'encoder_attn.%i.k_lin.bias',
+    'encoder_attn.%i.v_lin.weight', 'encoder_attn.%i.v_lin.bias',
+    'encoder_attn.%i.out_lin.weight', 'encoder_attn.%i.out_lin.bias'
+]
+
+TRANSFORMER_LAYER_PARAMS = [
+    'attentions.%i.q_lin.weight', 'attentions.%i.q_lin.bias',
+    'attentions.%i.k_lin.weight', 'attentions.%i.k_lin.bias',
+    'attentions.%i.v_lin.weight', 'attentions.%i.v_lin.bias',
+    'attentions.%i.out_lin.weight', 'attentions.%i.out_lin.bias',
+    'layer_norm1.%i.weight', 'layer_norm1.%i.bias',
+    'ffns.%i.lin1.weight', 'ffns.%i.lin1.bias',
+    'ffns.%i.lin2.weight', 'ffns.%i.lin2.bias',
+    'layer_norm2.%i.weight', 'layer_norm2.%i.bias'
+]
+
+class XLMConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `XLMModel`.
+    """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 d_model=1024,
+                 n_layer=24,
+                 n_head=16,
+                 d_inner=4096,
+                 ff_activation="gelu",
+                 untie_r=True,
+                 attn_type="bi",
+
+                 max_position_embeddings=512,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+
+                 dropout=0.1,
+                 dropatt=0.1,
+                 init="normal",
+                 init_range=0.1,
+                 init_std=0.02,
+                 mem_len=None,
+                 reuse_len=None,
+                 bi_data=False,
+                 clamp_len=-1,
+                 same_length=False):
+        """Constructs XLMConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+            d_model: Size of the encoder layers and the pooler layer.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            d_inner: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            ff_activation: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            untie_r: untie relative position biases
+            attn_type: 'bi' for XLM, 'uni' for Transformer-XL
+
+            dropout: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            dropatt: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
+
+            dropout: float, dropout rate.
+            dropatt: float, dropout rate on attention probabilities.
+            init: str, the initialization scheme, either "normal" or "uniform".
+            init_range: float, initialize the parameters with a uniform distribution
+                in [-init_range, init_range]. Only effective when init="uniform".
+            init_std: float, initialize the parameters with a normal distribution
+                with mean 0 and stddev init_std. Only effective when init="normal".
+            mem_len: int, the number of tokens to cache.
+            reuse_len: int, the number of tokens in the currect batch to be cached
+                and reused in the future.
+            bi_data: bool, whether to use bidirectional input pipeline.
+                Usually set to True during pretraining and False during finetuning.
+            clamp_len: int, clamp all relative distances larger than clamp_len.
+                -1 means no clamping.
+            same_length: bool, whether to use the same attention length for each token.
+        """
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.n_token = vocab_size_or_config_json_file
+            self.d_model = d_model
+            self.n_layer = n_layer
+            self.n_head = n_head
+            assert d_model % n_head == 0
+            self.d_head = d_model // n_head
+            self.ff_activation = ff_activation
+            self.d_inner = d_inner
+            self.untie_r = untie_r
+            self.attn_type = attn_type
+
+            self.max_position_embeddings = max_position_embeddings
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+
+            self.init = init
+            self.init_range = init_range
+            self.init_std = init_std
+            self.dropout = dropout
+            self.dropatt = dropatt
+            self.mem_len = mem_len
+            self.reuse_len = reuse_len
+            self.bi_data = bi_data
+            self.clamp_len = clamp_len
+            self.same_length = same_length
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as XLMLayerNorm
+except ImportError:
+    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
+    class XLMLayerNorm(nn.Module):
+        def __init__(self, d_model, eps=1e-12):
+            """Construct a layernorm module in the TF style (epsilon inside the square root).
+            """
+            super(XLMLayerNorm, self).__init__()
+            self.weight = nn.Parameter(torch.ones(d_model))
+            self.bias = nn.Parameter(torch.zeros(d_model))
+            self.variance_epsilon = eps
+
+        def forward(self, x):
+            u = x.mean(-1, keepdim=True)
+            s = (x - u).pow(2).mean(-1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+            return self.weight * x + self.bias
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx=None):
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
+    if padding_idx is not None:
+        nn.init.constant_(m.weight[padding_idx], 0)
+    return m
+
+
+def Linear(in_features, out_features, bias=True):
+    m = nn.Linear(in_features, out_features, bias)
+    # nn.init.normal_(m.weight, mean=0, std=1)
+    # nn.init.xavier_uniform_(m.weight)
+    # nn.init.constant_(m.bias, 0.)
+    return m
+
+
+def create_sinusoidal_embeddings(n_pos, dim, out):
+    position_enc = np.array([
+        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
+        for pos in range(n_pos)
+    ])
+    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+    out.detach_()
+    out.requires_grad = False
+
+
+def gelu(x):
+    """
+    GELU activation
+    https://arxiv.org/abs/1606.08415
+    https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
+    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
+    """
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def get_masks(slen, lengths, causal):
+    """
+    Generate hidden states mask, and optionally an attention mask.
+    """
+    assert lengths.max().item() <= slen
+    bs = lengths.size(0)
+    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
+    mask = alen < lengths[:, None]
+
+    # attention mask is the same as mask, or triangular inferior attention (causal)
+    if causal:
+        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
+    else:
+        attn_mask = mask
+
+    # sanity check
+    assert mask.size() == (bs, slen)
+    assert causal is False or attn_mask.size() == (bs, slen, slen)
+
+    return mask, attn_mask
+
+
+class MultiHeadAttention(nn.Module):
+
+    NEW_ID = itertools.count()
+
+    def __init__(self, n_heads, dim, dropout):
+        super().__init__()
+        self.layer_id = next(MultiHeadAttention.NEW_ID)
+        self.dim = dim
+        self.n_heads = n_heads
+        self.dropout = dropout
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = Linear(dim, dim)
+        self.k_lin = Linear(dim, dim)
+        self.v_lin = Linear(dim, dim)
+        self.out_lin = Linear(dim, dim)
+
+    def forward(self, input, mask, kv=None, cache=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = kv.size(1)
+        assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        n_heads = self.n_heads
+        dim_per_head = dim // n_heads
+        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
+
+        def shape(x):
+            """  projection """
+            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k_lin(k))                                          # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(v))                                          # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        q = q / math.sqrt(dim_per_head)                                       # (bs, n_heads, qlen, dim_per_head)
+        scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)               # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, -float('inf'))                              # (bs, n_heads, qlen, klen)
+
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        return self.out_lin(context)
+
+
+class TransformerFFN(nn.Module):
+
+    def __init__(self, in_dim, dim_hidden, out_dim, dropout, gelu_activation):
+        super().__init__()
+        self.dropout = dropout
+        self.lin1 = Linear(in_dim, dim_hidden)
+        self.lin2 = Linear(dim_hidden, out_dim)
+        self.act = gelu if gelu_activation else F.relu
+
+    def forward(self, input):
+        x = self.lin1(input)
+        x = self.act(x)
+        x = self.lin2(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        return x
+
+
+class BeamHypotheses(object):
+
+    def __init__(self, n_hyp, max_len, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.max_len = max_len - 1  # ignoring <BOS>
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.n_hyp = n_hyp
+        self.hyp = []
+        self.worst_score = 1e9
+
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.hyp)
+
+    def add(self, hyp, sum_logprobs):
+        """
+        Add a new hypothesis to the list.
+        """
+        score = sum_logprobs / len(hyp) ** self.length_penalty
+        if len(self) < self.n_hyp or score > self.worst_score:
+            self.hyp.append((score, hyp))
+            if len(self) > self.n_hyp:
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
+                del self.hyp[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+
+    def is_done(self, best_sum_logprobs):
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated
+        can become better than the worst one in the heap, then we are done with this sentence.
+        """
+        if len(self) < self.n_hyp:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            return self.worst_score >= best_sum_logprobs / self.max_len ** self.length_penalty
+
+
+class XLMPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(XLMPreTrainedModel, self).__init__()
+        if not isinstance(config, XLMBaseConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `XLMBaseConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, XLMLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, XLMRelativeAttention):
+            for param in [module.q, module.k, module.v, module.o, module.r,
+                          module.r_r_bias, module.r_s_bias, module.r_w_bias,
+                          module.seg_embed]:
+                param.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """
+        Instantiate a XLMPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `xlnet-large-cased`
+                - a path or url to a pretrained model archive containing:
+                    . `config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a XLMForPreTraining instance
+                - a path or url to a pretrained model archive containing:
+                    . `xlnet_config.json` a configuration file for the model
+                    . `model.chkpt` a TensorFlow checkpoint
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific XLM class
+                (ex: num_labels for XLMForSequenceClassification)
+        """
+        state_dict = kwargs.get('state_dict', None)
+        kwargs.pop('state_dict', None)
+        cache_dir = kwargs.get('cache_dir', None)
+        kwargs.pop('cache_dir', None)
+
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
+            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
+                config_file = os.path.join(pretrained_model_name_or_path, XLNET_CONFIG_NAME)
+            else:
+                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained weights.".format(
+                        archive_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                        archive_file))
+            return None
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        config_file))
+            return None
+        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+            logger.info("loading weights file {}".format(archive_file))
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = XLMConfig.from_json_file(resolved_config_file)
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config {}".format(config))
+
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None and not from_tf:
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+
+        # Load from a PyTorch state_dict
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        start_prefix = ''
+        if not hasattr(model, 'transformer') and any(s.startswith('transformer') for s in state_dict.keys()):
+            start_prefix = 'transformer.'
+        load(model, prefix=start_prefix)
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        if isinstance(model, XLMLMHeadModel):
+            model.tie_weights()  # make sure word embedding weights are still tied
+        return model
+
+
+class XLMModel(XLMPreTrainedModel):
+
+    ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
+                  'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', 
+                  'hidden_dim', 'dropout', 'attention_dropout', 'asm',
+                  'asm_cutoffs', 'asm_div_value']
+
+    def __init__(self, params, output_attentions=False, keep_multihead_output=False):  #, dico, is_encoder, with_output):
+        """
+        Transformer model (encoder or decoder).
+        """
+        super(XLMModel, self).__init__(params)
+        self.output_attentions = output_attentions
+
+        # encoder / decoder, output layer
+        # self.is_encoder = is_encoder
+        # self.is_decoder = not is_encoder
+        # self.with_output = with_output
+
+        # dictionary / languages
+        self.n_langs = params.n_langs
+        self.n_words = params.n_words
+        self.eos_index = params.eos_index
+        self.pad_index = params.pad_index
+        # self.dico = dico
+        self.id2lang = params.id2lang
+        self.lang2id = params.lang2id
+        # assert len(self.dico) == self.n_words
+        assert len(self.id2lang) == len(self.lang2id) == self.n_langs
+
+        # model parameters
+        self.dim = params.emb_dim       # 512 by default
+        self.hidden_dim = self.dim * 4  # 2048 by default
+        self.n_heads = params.n_heads   # 8 by default
+        self.n_layers = params.n_layers
+        self.dropout = params.dropout
+        self.attention_dropout = params.attention_dropout
+        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
+
+        # embeddings
+        self.position_embeddings = Embedding(params.max_position_embeddings, self.dim)
+        if params.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(params.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
+        if params.n_langs > 1:
+            self.lang_embeddings = Embedding(self.n_langs, self.dim)
+        self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)
+
+        # transformer layers
+        self.attentions = nn.ModuleList()
+        self.layer_norm1 = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        self.layer_norm2 = nn.ModuleList()
+        if self.is_decoder:
+            self.layer_norm15 = nn.ModuleList()
+            self.encoder_attn = nn.ModuleList()
+
+        for _ in range(self.n_layers):
+            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
+            if self.is_decoder:
+                self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
+                self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout, gelu_activation=params.gelu_activation))
+            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
+
+        # output layer
+        # if self.with_output:
+        #     self.pred_layer = PredLayer(params)
+        #     if params.share_inout_emb:
+        #         self.pred_layer.proj.weight = self.embeddings.weight
+
+    # def forward(self, mode, **kwargs):
+    #     """
+    #     Forward function with different forward modes.
+    #     ### Small hack to handle PyTorch distributed.
+    #     """
+    #     if mode == 'fwd':
+    #         return self.fwd(**kwargs)
+    #     elif mode == 'predict':
+    #         return self.predict(**kwargs)
+    #     else:
+    #         raise Exception("Unknown mode: %s" % mode)
+
+    def forward(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, langs=None, cache=None):
+        """
+        Inputs:
+            `x` LongTensor(slen, bs), containing word indices
+            `lengths` LongTensor(bs), containing the length of each sentence
+            `causal` Boolean, if True, the attention is only done over previous hidden states
+            `positions` LongTensor(slen, bs), containing word positions
+            `langs` LongTensor(slen, bs), containing language IDs
+        """
+        # lengths = (x != self.pad_index).float().sum(dim=1)
+        # mask = x != self.pad_index
+
+        # check inputs
+        slen, bs = x.size()
+        assert lengths.size(0) == bs
+        assert lengths.max().item() <= slen
+        x = x.transpose(0, 1)  # batch size as dimension 0
+        assert (src_enc is None) == (src_len is None)
+        if src_enc is not None:
+            assert self.is_decoder
+            assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, causal)
+        if self.is_decoder and src_enc is not None:
+            src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # positions
+        if positions is None:
+            positions = x.new(slen).long()
+            positions = torch.arange(slen, out=positions).unsqueeze(0)
+        else:
+            assert positions.size() == (slen, bs)
+            positions = positions.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            assert langs.size() == (slen, bs)
+            langs = langs.transpose(0, 1)
+
+        # do not recompute cached elements
+        if cache is not None:
+            _slen = slen - cache['slen']
+            x = x[:, -_slen:]
+            positions = positions[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        tensor = self.embeddings(x)
+        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
+        if langs is not None:
+            tensor = tensor + self.lang_embeddings(langs)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
+        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # transformer layers
+        for i in range(self.n_layers):
+
+            # self attention
+            attn = self.attentions[i](tensor, attn_mask, cache=cache)
+            attn = F.dropout(attn, p=self.dropout, training=self.training)
+            tensor = tensor + attn
+            tensor = self.layer_norm1[i](tensor)
+
+            # encoder attention (for decoder only)
+            if self.is_decoder and src_enc is not None:
+                attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+                attn = F.dropout(attn, p=self.dropout, training=self.training)
+                tensor = tensor + attn
+                tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            tensor = tensor + self.ffns[i](tensor)
+            tensor = self.layer_norm2[i](tensor)
+            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # update cache length
+        if cache is not None:
+            cache['slen'] += tensor.size(1)
+
+        # move back sequence length to dimension 0
+        tensor = tensor.transpose(0, 1)
+
+        return tensor
+
+    def predict(self, tensor, pred_mask, y, get_scores):
+        """
+        Given the last hidden state, compute word scores and/or the loss.
+            `pred_mask` is a ByteTensor of shape (slen, bs), filled with 1 when
+                we need to predict a word
+            `y` is a LongTensor of shape (pred_mask.sum(),)
+            `get_scores` is a boolean specifying whether we need to return scores
+        """
+        masked_tensor = tensor[pred_mask.unsqueeze(-1).expand_as(tensor)].view(-1, self.dim)
+        scores, loss = self.pred_layer(masked_tensor, y, get_scores)
+        return scores, loss
+
+    def generate(self, src_enc, src_len, tgt_lang_id, max_len=200, sample_temperature=None):
+        """
+        Decode a sentence given initial start.
+        `x`:
+            - LongTensor(bs, slen)
+                <EOS> W1 W2 W3 <EOS> <PAD>
+                <EOS> W1 W2 W3   W4  <EOS>
+        `lengths`:
+            - LongTensor(bs) [5, 6]
+        `positions`:
+            - False, for regular "arange" positions (LM)
+            - True, to reset positions from the new generation (MT)
+        `langs`:
+            - must be None if the model only supports one language
+            - lang_id if only one language is involved (LM)
+            - (lang_id1, lang_id2) if two languages are involved (MT)
+        """
+
+        # input batch
+        bs = len(src_len)
+        assert src_enc.size(0) == bs
+
+        # generated sentences
+        generated = src_len.new(max_len, bs)  # upcoming output
+        generated.fill_(self.pad_index)       # fill upcoming ouput with <PAD>
+        generated[0].fill_(self.eos_index)    # we use <EOS> for <BOS> everywhere
+
+        # positions
+        positions = src_len.new(max_len).long()
+        positions = torch.arange(max_len, out=positions).unsqueeze(1).expand(max_len, bs)
+
+        # language IDs
+        langs = src_len.new(max_len).long().fill_(tgt_lang_id)
+        langs = langs.unsqueeze(1).expand(max_len, bs)
+
+        # current position / max lengths / length of generated sentences / unfinished sentences
+        cur_len = 1
+        gen_len = src_len.clone().fill_(1)
+        unfinished_sents = src_len.clone().fill_(1)
+
+        # cache compute states
+        cache = {'slen': 0}
+
+        while cur_len < max_len:
+
+            # compute word scores
+            tensor = self.forward(
+                'fwd',
+                x=generated[:cur_len],
+                lengths=gen_len,
+                positions=positions[:cur_len],
+                langs=langs[:cur_len],
+                causal=True,
+                src_enc=src_enc,
+                src_len=src_len,
+                cache=cache
+            )
+            assert tensor.size() == (1, bs, self.dim)
+            tensor = tensor.data[-1, :, :]               # (bs, dim)
+            scores = self.pred_layer.get_scores(tensor)  # (bs, n_words)
+
+            # select next words: sample or greedy
+            if sample_temperature is None:
+                next_words = torch.topk(scores, 1)[1].squeeze(1)
+            else:
+                next_words = torch.multinomial(F.softmax(scores / sample_temperature, dim=1), 1).squeeze(1)
+            assert next_words.size() == (bs,)
+
+            # update generations / lengths / finished sentences / current length
+            generated[cur_len] = next_words * unfinished_sents + self.pad_index * (1 - unfinished_sents)
+            gen_len.add_(unfinished_sents)
+            unfinished_sents.mul_(next_words.ne(self.eos_index).long())
+            cur_len = cur_len + 1
+
+            # stop when there is a </s> in each sentence, or if we exceed the maximul length
+            if unfinished_sents.max() == 0:
+                break
+
+        # add <EOS> to unfinished sentences
+        if cur_len == max_len:
+            generated[-1].masked_fill_(unfinished_sents.byte(), self.eos_index)
+
+        # sanity check
+        assert (generated == self.eos_index).sum() == 2 * bs
+
+        return generated[:cur_len], gen_len
+
+    def generate_beam(self, src_enc, src_len, tgt_lang_id, beam_size, length_penalty, early_stopping, max_len=200):
+        """
+        Decode a sentence given initial start.
+        `x`:
+            - LongTensor(bs, slen)
+                <EOS> W1 W2 W3 <EOS> <PAD>
+                <EOS> W1 W2 W3   W4  <EOS>
+        `lengths`:
+            - LongTensor(bs) [5, 6]
+        `positions`:
+            - False, for regular "arange" positions (LM)
+            - True, to reset positions from the new generation (MT)
+        `langs`:
+            - must be None if the model only supports one language
+            - lang_id if only one language is involved (LM)
+            - (lang_id1, lang_id2) if two languages are involved (MT)
+        """
+
+        # check inputs
+        assert src_enc.size(0) == src_len.size(0)
+        assert beam_size >= 1
+
+        # batch size / number of words
+        bs = len(src_len)
+        n_words = self.n_words
+
+        # expand to beam size the source latent representations / source lengths
+        src_enc = src_enc.unsqueeze(1).expand((bs, beam_size) + src_enc.shape[1:]).contiguous().view((bs * beam_size,) + src_enc.shape[1:])
+        src_len = src_len.unsqueeze(1).expand(bs, beam_size).contiguous().view(-1)
+
+        # generated sentences (batch with beam current hypotheses)
+        generated = src_len.new(max_len, bs * beam_size)  # upcoming output
+        generated.fill_(self.pad_index)                   # fill upcoming ouput with <PAD>
+        generated[0].fill_(self.eos_index)                # we use <EOS> for <BOS> everywhere
+
+        # generated hypotheses
+        generated_hyps = [BeamHypotheses(beam_size, max_len, length_penalty, early_stopping) for _ in range(bs)]
+
+        # positions
+        positions = src_len.new(max_len).long()
+        positions = torch.arange(max_len, out=positions).unsqueeze(1).expand_as(generated)
+
+        # language IDs
+        langs = positions.clone().fill_(tgt_lang_id)
+
+        # scores for each sentence in the beam
+        beam_scores = src_enc.new(bs, beam_size).fill_(0)
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view(-1)
+
+        # current position
+        cur_len = 1
+
+        # cache compute states
+        cache = {'slen': 0}
+
+        # done sentences
+        done = [False for _ in range(bs)]
+
+        while cur_len < max_len:
+
+            # compute word scores
+            tensor = self.forward(
+                'fwd',
+                x=generated[:cur_len],
+                lengths=src_len.new(bs * beam_size).fill_(cur_len),
+                positions=positions[:cur_len],
+                langs=langs[:cur_len],
+                causal=True,
+                src_enc=src_enc,
+                src_len=src_len,
+                cache=cache
+            )
+            assert tensor.size() == (1, bs * beam_size, self.dim)
+            tensor = tensor.data[-1, :, :]               # (bs * beam_size, dim)
+            scores = self.pred_layer.get_scores(tensor)  # (bs * beam_size, n_words)
+            scores = F.log_softmax(scores, dim=-1)       # (bs * beam_size, n_words)
+            assert scores.size() == (bs * beam_size, n_words)
+
+            # select next words with scores
+            _scores = scores + beam_scores[:, None].expand_as(scores)  # (bs * beam_size, n_words)
+            _scores = _scores.view(bs, beam_size * n_words)            # (bs, beam_size * n_words)
+
+            next_scores, next_words = torch.topk(_scores, 2 * beam_size, dim=1, largest=True, sorted=True)
+            assert next_scores.size() == next_words.size() == (bs, 2 * beam_size)
+
+            # next batch beam content
+            # list of (bs * beam_size) tuple(next hypothesis score, next word, current position in the batch)
+            next_batch_beam = []
+
+            # for each sentence
+            for sent_id in range(bs):
+
+                # if we are done with this sentence
+                done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item())
+                if done[sent_id]:
+                    next_batch_beam.extend([(0, self.pad_index, 0)] * beam_size)  # pad the batch
+                    continue
+
+                # next sentence beam content
+                next_sent_beam = []
+
+                # next words for this sentence
+                for idx, value in zip(next_words[sent_id], next_scores[sent_id]):
+
+                    # get beam and word IDs
+                    beam_id = idx // n_words
+                    word_id = idx % n_words
+
+                    # end of sentence, or next word
+                    if word_id == self.eos_index or cur_len + 1 == max_len:
+                        generated_hyps[sent_id].add(generated[:cur_len, sent_id * beam_size + beam_id].clone(), value.item())
+                    else:
+                        next_sent_beam.append((value, word_id, sent_id * beam_size + beam_id))
+
+                    # the beam for next step is full
+                    if len(next_sent_beam) == beam_size:
+                        break
+
+                # update next beam content
+                assert len(next_sent_beam) == 0 if cur_len + 1 == max_len else beam_size
+                if len(next_sent_beam) == 0:
+                    next_sent_beam = [(0, self.pad_index, 0)] * beam_size  # pad the batch
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == beam_size * (sent_id + 1)
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == bs * beam_size
+            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+            beam_words = generated.new([x[1] for x in next_batch_beam])
+            beam_idx = src_len.new([x[2] for x in next_batch_beam])
+
+            # re-order batch and internal states
+            generated = generated[:, beam_idx]
+            generated[cur_len] = beam_words
+            for k in cache.keys():
+                if k != 'slen':
+                    cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx])
+
+            # update current length
+            cur_len = cur_len + 1
+
+            # stop when we are done with each sentence
+            if all(done):
+                break
+
+        # visualize hypotheses
+        # print([len(x) for x in generated_hyps], cur_len)
+        # globals().update( locals() );
+        # !import code; code.interact(local=vars())
+        # for ii in range(bs):
+        #     for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True):
+        #         print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist()))
+        #     print("")
+
+        # select the best hypotheses
+        tgt_len = src_len.new(bs)
+        best = []
+
+        for i, hypotheses in enumerate(generated_hyps):
+            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
+            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
+            best.append(best_hyp)
+
+        # generate target batch
+        decoded = src_len.new(tgt_len.max().item(), bs).fill_(self.pad_index)
+        for i, hypo in enumerate(best):
+            decoded[:tgt_len[i] - 1, i] = hypo
+            decoded[tgt_len[i] - 1, i] = self.eos_index
+
+        # sanity check
+        assert (decoded == self.eos_index).sum() == 2 * bs
+
+        return decoded, tgt_len
+
+
+class XLMModel(XLMPreTrainedModel):
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLMModel, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.mem_len = config.mem_len
+        self.reuse_len = config.reuse_len
+        self.d_model = config.d_model
+        self.same_length = config.same_length
+        self.attn_type = config.attn_type
+        self.bi_data = config.bi_data
+        self.clamp_len = config.clamp_len
+
+        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
+        layer = XLMLayer(config, output_attentions=output_attentions,
+                                   keep_multihead_output=keep_multihead_output)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
+        self.dropout = nn.Dropout(config.dropout)
+
+    def prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.layer[layer].attention.prune_heads(heads)
+
+    def get_multihead_outputs(self):
+        """ Gather all multi-head outputs.
+            Return: list (layers) of multihead module outputs with gradients
+        """
+        return [layer.attention.self.multihead_output for layer in self.layer]
+
+    def create_mask(self, qlen, mlen):
+        """ create causal attention mask.
+            float mask where 1.0 indicate masked, 0.0 indicated not-masked.
+             same_length=False:      same_length=True:
+             <mlen > <  qlen >       <mlen > <  qlen >
+          ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
+            [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
+       qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
+            [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
+          v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
+        """
+        attn_mask = torch.ones([qlen, qlen])
+        mask_up = torch.triu(attn_mask, diagonal=1)
+        attn_mask_pad = torch.zeros([qlen, mlen])
+        ret = torch.cat([attn_mask_pad, mask_up], dim=1)
+        if self.same_length:
+            mask_lo = torch.tril(attn_mask, diagonal=-1)
+            ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1)
+
+        ret = ret.to(next(self.parameters()))
+        return ret
+
+    def cache_mem(self, curr_out, prev_mem):
+        """cache hidden states into memory."""
+        if self.mem_len is None or self.mem_len == 0:
+            return None
+        else:
+            if self.reuse_len is not None and self.reuse_len > 0:
+                curr_out = curr_out[:self.reuse_len]
+
+            if prev_mem is None:
+                new_mem = curr_out[-self.mem_len:]
+            else:
+                new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:]
+
+        return new_mem.detach()
+
+    @staticmethod
+    def positional_embedding(pos_seq, inv_freq, bsz=None):
+        sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
+        pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
+        pos_emb = pos_emb[:, None, :]
+
+        if bsz is not None:
+            pos_emb = pos_emb.expand(-1, bsz, -1)
+
+        return pos_emb
+
+    def relative_positional_encoding(self, qlen, klen, bsz=None):
+        """create relative positional encoding."""
+        freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
+        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
+
+        if self.attn_type == 'bi':
+            # beg, end = klen - 1, -qlen
+            beg, end = klen, -qlen
+        elif self.attn_type == 'uni':
+            # beg, end = klen - 1, -1
+            beg, end = klen, -1
+        else:
+            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+
+        if self.bi_data:
+            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
+            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float)
+
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+                bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+
+            if bsz is not None:
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+            else:
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
+
+            pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
+        else:
+            fwd_pos_seq = torch.arange(beg, end, -1.0)
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
+
+        pos_emb = pos_emb.to(next(self.parameters()))
+        return pos_emb
+
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+                0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
+            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: [optional] float32 Tensor in shape [bsz, len].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
+
+            mem_len: int, the number of tokens to cache.
+            reuse_len: int, the number of tokens in the currect batch to be cached
+                and reused in the future.
+            bi_data: bool, whether to use bidirectional input pipeline.
+                Usually set to True during pretraining and False during finetuning.
+            clamp_len: int, clamp all relative distances larger than clamp_len.
+                -1 means no clamping.
+            same_length: bool, whether to use the same attention length for each token.
+            summary_type: str, "last", "first", "mean", or "attn". The method
+                to pool the input to get a vector representation.
+        """
+        # the original code for XLM uses shapes [len, bsz] with the batch dimension at the end
+        # but we want a unified interface in the library with the batch size on the first dimension
+        # so we move here the first dimension (batch) to the end
+        inp_k = inp_k.transpose(0, 1).contiguous()
+        token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
+        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
+        attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
+        perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
+        target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
+        inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
+
+        qlen, bsz = inp_k.shape[0], inp_k.shape[1]
+        mlen = mems[0].shape[0] if mems is not None else 0
+        klen = mlen + qlen
+
+        dtype_float = next(self.parameters()).dtype
+        device = next(self.parameters()).device
+
+        ##### Attention mask
+        # causal attention mask
+        if self.attn_type == 'uni':
+            attn_mask = self.create_mask(qlen, mlen)
+            attn_mask = attn_mask[:, :, None, None]
+        elif self.attn_type == 'bi':
+            attn_mask = None
+        else:
+            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+
+        # data mask: input mask & perm mask
+        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
+        "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+        if input_mask is None and attention_mask is not None:
+            input_mask = 1.0 - attention_mask
+        if input_mask is not None and perm_mask is not None:
+            data_mask = input_mask[None] + perm_mask
+        elif input_mask is not None and perm_mask is None:
+            data_mask = input_mask[None]
+        elif input_mask is None and perm_mask is not None:
+            data_mask = perm_mask
+        else:
+            data_mask = None
+
+        if data_mask is not None:
+            # all mems can be attended to
+            mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
+            data_mask = torch.cat([mems_mask, data_mask], dim=1)
+            if attn_mask is None:
+                attn_mask = data_mask[:, :, :, None]
+            else:
+                attn_mask += data_mask[:, :, :, None]
+
+        if attn_mask is not None:
+            attn_mask = (attn_mask > 0).to(dtype_float)
+
+        if attn_mask is not None:
+            non_tgt_mask = -torch.eye(qlen).to(attn_mask)
+            non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
+            non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
+        else:
+            non_tgt_mask = None
+
+        ##### Word embeddings and prepare h & g hidden states
+        word_emb_k = self.word_embedding(inp_k)
+        output_h = self.dropout(word_emb_k)
+        if inp_q is not None:
+            if target_mapping is not None:
+                word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
+            else:
+                inp_q_ext = inp_q[:, :, None]
+                word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
+            output_g = self.dropout(word_emb_q)
+        else:
+            output_g = None
+
+        ##### Segment embedding
+        if token_type_ids is not None:
+            # Convert `token_type_ids` to one-hot `seg_mat`
+            mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
+            cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
+
+            # `1` indicates not in the same segment [qlen x klen x bsz]
+            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
+            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
+        else:
+            seg_mat = None
+
+        ##### Positional encoding
+        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
+        pos_emb = self.dropout(pos_emb)
+
+        ##### Head mask if needed (for bertology/pruning)
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [n_layer x num_heads]
+        # and head_mask is converted to shape [n_layer x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        new_mems = []
+        if mems is None:
+            mems = [None] * len(self.layer)
+
+        hidden_states = []
+        for i, layer_module in enumerate(self.layer):
+            # cache new mems
+            new_mems.append(self.cache_mem(output_h, mems[i]))
+
+            output_h, output_g = layer_module(output_h, output_g,
+                                              attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
+                                              r=pos_emb, seg_mat=seg_mat,
+                                              mems=mems[i], target_mapping=target_mapping,
+                                              head_mask=head_mask)
+            hidden_states.append(output_h)
+        output = self.dropout(output_g if output_g is not None else output_h)
+
+        # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
+        output = output.permute(1, 0, 2).contiguous()
+        hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+
+        return output, hidden_states, new_mems
+
+
+class XLMPredLayer(nn.Module):
+    """
+    Prediction layer (cross_entropy or adaptive_softmax).
+    """
+    def __init__(self, params):
+        super().__init__()
+        self.asm = params.asm
+        self.n_words = params.n_words
+        self.pad_index = params.pad_index
+        dim = params.emb_dim
+
+        if params.asm is False:
+            self.proj = Linear(dim, params.n_words, bias=True)
+        else:
+            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+                in_features=dim,
+                n_classes=params.n_words,
+                cutoffs=params.asm_cutoffs,
+                div_value=params.asm_div_value,
+                head_bias=True,  # default is False
+            )
+
+    def forward(self, x, y, get_scores=False):
+        """
+        Compute the loss, and optionally the scores.
+        """
+        assert (y == self.pad_index).sum().item() == 0
+
+        if self.asm is False:
+            scores = self.proj(x).view(-1, self.n_words)
+            loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
+        else:
+            _, loss = self.proj(x, y)
+            scores = self.proj.log_prob(x) if get_scores else None
+
+        return scores, loss
+
+    def get_scores(self, x):
+        """
+        Compute scores.
+        """
+        assert x.dim() == 2
+        return self.proj.log_prob(x) if self.asm else self.proj(x)
+
+
+class XLMLMHeadModel(XLMPreTrainedModel):
+    """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
+
+    Params:
+        `config`: a XLMConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    Inputs:
+        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+            0 for real tokens and 1 for padding.
+        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
+        mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            from previous batches. The length of the list equals n_layer.
+            If None, no memory is used.
+        perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
+            If perm_mask[k, i, j] = 0, i attend to j in batch k;
+            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+            If None, each position attends to all the others.
+        target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
+            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+            on the j-th token.
+            Only used during pretraining for partial prediction.
+            Set to None during finetuning.
+        inp_q: [optional] float32 Tensor in shape [bsz, len].
+            1 for tokens with losses and 0 for tokens without losses.
+            Only used during pretraining for two-stream attention.
+            Set to None during finetuning.
+
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, d_model],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, d_model],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, d_model] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
+        n_layer=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.XLMModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLMLMHeadModel, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.attn_type = config.attn_type
+        self.same_length = config.same_length
+
+        self.transformer = XLMModel(config, output_attentions=output_attentions,
+                                              keep_multihead_output=keep_multihead_output)
+        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+
+        # Tie weights
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the embeddings
+        """
+        self.lm_loss.weight = self.transformer.word_embedding.weight
+
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                labels=None, output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+                0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
+            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: float32 Tensor in shape [bsz, len].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
+
+            summary_type: str, "last", "first", "mean", or "attn". The method
+                to pool the input to get a vector representation.
+        """
+        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                            mems, perm_mask, target_mapping, inp_q,
+                                            output_all_encoded_layers, head_mask)
+
+        logits = self.lm_loss(output)
+
+        if labels is not None:
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(logits.view(-1, logits.size(-1)),
+                            labels.view(-1))
+            return loss, new_mems
+
+        # if self.output_attentions:
+        #     all_attentions, encoded_layers = encoded_layers
+        # sequence_output = encoded_layers[-1]
+        # pooled_output = self.pooler(sequence_output)
+        # if not output_all_encoded_layers:
+        #     encoded_layers = encoded_layers[-1]
+        # if self.output_attentions:
+        return logits, new_mems
+        #     return all_attentions, encoded_layers, pooled_output
+
+
+class XLMSequenceSummary(nn.Module):
+    def __init__(self, config, summary_type="last", use_proj=True,
+                 output_attentions=False, keep_multihead_output=False):
+        super(XLMSequenceSummary, self).__init__()
+        self.summary_type = summary_type
+        if use_proj:
+            self.summary = nn.Linear(config.d_model, config.d_model)
+        else:
+            self.summary = None
+        if summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+        self.dropout = nn.Dropout(config.dropout)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        """ hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer."""
+        if self.summary_type == 'last':
+            output = hidden_states[:, -1]
+        elif self.summary_type == 'first':
+            output = hidden_states[:, 0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=1)
+        elif summary_type == 'attn':
+            raise NotImplementedError
+
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.dropout(output)
+        return output
+
+
+class XLMForSequenceClassification(XLMPreTrainedModel):
+    """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
+
+    Params:
+        `config`: a XLMConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+        `summary_type`: str, "last", "first", "mean", or "attn". The method
+            to pool the input to get a vector representation. Default: last
+
+    Inputs:
+        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+        input_mask: float32 Tensor in shape [bsz, len], the input mask.
+            0 for real tokens and 1 for padding.
+        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
+        mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            from previous batches. The length of the list equals n_layer.
+            If None, no memory is used.
+        perm_mask: float32 Tensor in shape [bsz, len, len].
+            If perm_mask[k, i, j] = 0, i attend to j in batch k;
+            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+            If None, each position attends to all the others.
+        target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+            on the j-th token.
+            Only used during pretraining for partial prediction.
+            Set to None during finetuning.
+        inp_q: float32 Tensor in shape [bsz, len].
+            1 for tokens with losses and 0 for tokens without losses.
+            Only used during pretraining for two-stream attention.
+            Set to None during finetuning.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+    Outputs: Tuple of (logits or loss, mems)
+        `logits or loss`:
+            if labels is None:
+                Token logits with shape [batch_size, sequence_length] 
+            else:
+                CrossEntropy loss with the targets
+        `new_mems`: list (num layers) of updated mem states at the entry of each layer
+            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
+        n_layer=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.XLMModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
+                 output_attentions=False, keep_multihead_output=False):
+        super(XLMForSequenceClassification, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.attn_type = config.attn_type
+        self.same_length = config.same_length
+        self.summary_type = summary_type
+        self.num_labels = num_labels
+
+        self.transformer = XLMModel(config, output_attentions=output_attentions,
+                                              keep_multihead_output=keep_multihead_output)
+
+        self.sequence_summary = XLMSequenceSummary(config, summary_type=summary_type,
+                                                     use_proj=use_proj, output_attentions=output_attentions,
+                                                     keep_multihead_output=keep_multihead_output)
+        self.logits_proj = nn.Linear(config.d_model, num_labels)
+        self.apply(self.init_weights)
+
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                labels=None, output_all_encoded_layers=True, head_mask=None):
+        """
+        Args:
+            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
+            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+                0 for real tokens and 1 for padding.
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
+            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+                from previous batches. The length of the list equals n_layer.
+                If None, no memory is used.
+            perm_mask: float32 Tensor in shape [bsz, len, len].
+                If perm_mask[k, i, j] = 0, i attend to j in batch k;
+                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
+                If None, each position attends to all the others.
+            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
+                on the j-th token.
+                Only used during pretraining for partial prediction.
+                Set to None during finetuning.
+            inp_q: float32 Tensor in shape [bsz, len].
+                1 for tokens with losses and 0 for tokens without losses.
+                Only used during pretraining for two-stream attention.
+                Set to None during finetuning.
+        """
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                               mems, perm_mask, target_mapping, inp_q,
+                                               output_all_encoded_layers, head_mask)
+
+        output = self.sequence_summary(output)
+        logits = self.logits_proj(output)
+
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            return loss, new_mems
+
+        # if self.output_attentions:
+        #     all_attentions, encoded_layers = encoded_layers
+        # sequence_output = encoded_layers[-1]
+        # pooled_output = self.pooler(sequence_output)
+        # if not output_all_encoded_layers:
+        #     encoded_layers = encoded_layers[-1]
+        # if self.output_attentions:
+        return logits, new_mems
+        #     return all_attentions, encoded_layers, pooled_output
+
+
+class XLMForQuestionAnswering(XLMPreTrainedModel):
+    """XLM model for Question Answering (span extraction).
+    This module is composed of the XLM model with a linear layer on top of
+    the sequence output that computes start_logits and end_logits
+
+    Params:
+        `config`: a XLMConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see XLM paper for more details).
+        `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+            but with 1 for real tokens and 0 for padding.
+            Added for easy compatibility with the BERT model (which uses this negative masking).
+            You can only uses one among `input_mask` and `attention_mask`
+        `input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+            into account for computing the loss.
+        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+            into account for computing the loss.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+    Outputs:
+        if `start_positions` and `end_positions` are not `None`:
+            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
+        if `start_positions` or `end_positions` is `None`:
+            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
+            position tokens of shape [batch_size, sequence_length].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = XLMForQuestionAnswering(config)
+    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+        super(XLMForQuestionAnswering, self).__init__(config)
+        self.output_attentions = output_attentions
+        self.transformer = XLMModel(config, output_attentions=output_attentions,
+                                      keep_multihead_output=keep_multihead_output)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.apply(self.init_weights)
+
+    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                start_positions=None, end_positions=None,
+                output_all_encoded_layers=True, head_mask=None):
+        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                            mems, perm_mask, target_mapping, inp_q,
+                                            output_all_encoded_layers, head_mask)
+
+        logits = self.qa_outputs(output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            return total_loss
+        elif self.output_attentions:
+            return all_attentions, start_logits, end_logits
+        return start_logits, end_logits
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 8963f53615..e9c7c72e12 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -32,7 +32,9 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
+from .file_utils import cached_path
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
+
 
 logger = logging.getLogger(__name__)
 
@@ -192,48 +194,12 @@ def swish(x):
 
 ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
-class XLNetBaseConfig(object):
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `XLNetBaseConfig` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
 
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `XLNetBaseConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def update(self, other):
-        dict_b = other.to_dict()
-        for key, value in dict_b.items():
-            self.__dict__[key] = value
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
-
-class XLNetConfig(XLNetBaseConfig):
+class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLNetModel`.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
     def __init__(self,
                  vocab_size_or_config_json_file,
                  d_model=1024,
@@ -337,53 +303,6 @@ class XLNetConfig(XLNetBaseConfig):
                              "or the path to a pretrained model config file (str)")
 
 
-class XLNetRunConfig(XLNetBaseConfig):
-    """XLNetRunConfig contains hyperparameters that could be different
-    between pretraining and finetuning.
-    These hyperparameters can also be changed from run to run.
-    We store them separately from XLNetConfig for flexibility.
-    """
-    def __init__(self, 
-                 dropout=0.1,
-                 dropatt=0.1,
-                 init="normal",
-                 init_range=0.1,
-                 init_std=0.02,
-                 mem_len=None,
-                 reuse_len=None,
-                 bi_data=False,
-                 clamp_len=-1,
-                 same_length=False):
-        """
-        Args:
-        dropout: float, dropout rate.
-        dropatt: float, dropout rate on attention probabilities.
-        init: str, the initialization scheme, either "normal" or "uniform".
-        init_range: float, initialize the parameters with a uniform distribution
-            in [-init_range, init_range]. Only effective when init="uniform".
-        init_std: float, initialize the parameters with a normal distribution
-            with mean 0 and stddev init_std. Only effective when init="normal".
-        mem_len: int, the number of tokens to cache.
-        reuse_len: int, the number of tokens in the currect batch to be cached
-            and reused in the future.
-        bi_data: bool, whether to use bidirectional input pipeline.
-            Usually set to True during pretraining and False during finetuning.
-        clamp_len: int, clamp all relative distances larger than clamp_len.
-            -1 means no clamping.
-        same_length: bool, whether to use the same attention length for each token.
-        """
-
-        self.init = init
-        self.init_range = init_range
-        self.init_std = init_std
-        self.dropout = dropout
-        self.dropatt = dropatt
-        self.mem_len = mem_len
-        self.reuse_len = reuse_len
-        self.bi_data = bi_data
-        self.clamp_len = clamp_len
-        self.same_length = same_length
-
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
 except ImportError:
@@ -637,9 +556,9 @@ class XLNetPreTrainedModel(nn.Module):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(XLNetPreTrainedModel, self).__init__()
-        if not isinstance(config, XLNetBaseConfig):
+        if not isinstance(config, XLNetConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `XLNetBaseConfig`. "
+                "Parameter config in `{}(config)` should be an instance of class `XLNetConfig`. "
                 "To create a model from a Google pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index 3d14af5d7c..237fa16d3a 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -25,7 +25,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (XLNetConfig, XLNetRunConfig, XLNetModel, XLNetLMHeadModel)
+from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel)
 from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
 class XLNetModelTest(unittest.TestCase):
@@ -117,17 +117,13 @@ class XLNetModelTest(unittest.TestCase):
                 d_inner=self.d_inner,
                 n_layer=self.n_layer,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings)
-
-            run_config = XLNetRunConfig(
+                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
                 reuse_len=self.reuse_len,
                 bi_data=self.bi_data)
 
-            config.update(run_config)
-
             return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels)
 
         def set_seed(self):

From ddc2cc61a6655ce0ac00d8d0fbb185d3848a6c3d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 11:17:42 +0200
Subject: [PATCH 022/139] fix python2 tests

---
 pytorch_pretrained_bert/model_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index af35e7bcc4..2ae9281240 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -21,6 +21,7 @@ import logging
 import os
 import json
 import copy
+from io import open
 
 import torch
 from torch import nn

From 59cefd4f985b7221846189690ead3300ff864b3d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 11:28:27 +0200
Subject: [PATCH 023/139] fix #726 - get_lr in examples

---
 examples/run_bert_squad.py       | 3 ++-
 examples/run_xlnet_classifier.py | 3 ++-
 examples/run_xlnet_squad.py      | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index b35a9175ec..9aaa711c2b 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -313,7 +313,8 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
                     if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                         tb_writer.add_scalar('loss', loss.item(), global_step)
 
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 0278b40cdd..2309815981 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -319,7 +319,8 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
                     if args.local_rank in [-1, 0] and (args.log_every <= 0 or (step + 1) % args.log_every == 0):
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                         tb_writer.add_scalar('loss', loss.item(), global_step)
 
     ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index a72d648ff7..927668c57a 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -313,7 +313,8 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
                     if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                         tb_writer.add_scalar('loss', loss.item(), global_step)
 
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):

From 4d47f4985dfb09237b6e11b5eafb0b1935f8c634 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 12:52:44 +0200
Subject: [PATCH 024/139] slight refactoring, add abstract class for model
 loading

---
 pytorch_pretrained_bert/__init__.py           |   3 +-
 pytorch_pretrained_bert/model_utils.py        | 177 ++++++++++++
 pytorch_pretrained_bert/modeling.py           | 177 ++----------
 pytorch_pretrained_bert/modeling_gpt2.py      | 254 +++++++++---------
 pytorch_pretrained_bert/modeling_openai.py    | 164 ++---------
 .../modeling_transfo_xl.py                    | 172 ++----------
 pytorch_pretrained_bert/modeling_xlm.py       | 154 +----------
 pytorch_pretrained_bert/modeling_xlnet.py     | 180 ++-----------
 8 files changed, 397 insertions(+), 884 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 12e251522c..7d823a045d 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -28,4 +28,5 @@ from .optimization_openai import OpenAIAdam
 
 from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
 
-from .model_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig)
+from .model_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
+                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index 2ae9281240..c262d7b6c8 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -33,6 +33,7 @@ logger = logging.getLogger(__name__)
 
 CONFIG_NAME = "config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
+TF_WEIGHTS_NAME = 'model.ckpt'
 
 
 class PretrainedConfig(object):
@@ -131,6 +132,169 @@ class PretrainedConfig(object):
             writer.write(self.to_json_string())
 
 
+class PreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = PretrainedConfig
+    pretrained_model_archive_map = {}
+    pretrained_config_archive_map = {}
+    load_tf_weights = lambda model, config, path: None
+    base_model_prefix = ""
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedModel, self).__init__()
+        if not isinstance(config, PretrainedConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
+                "To create a model from a pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load, or
+                - a path or url to a pretrained model archive containing:
+                    . `config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
+                - a path or url to a tensorflow pretrained model checkpoint containing:
+                    . `config.json` a configuration file for the model
+                    . `model.chkpt` a TensorFlow checkpoint
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use
+                instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific XLNet class
+                (ex: num_labels for XLNetForSequenceClassification)
+        """
+        state_dict = kwargs.get('state_dict', None)
+        kwargs.pop('state_dict', None)
+        cache_dir = kwargs.get('cache_dir', None)
+        kwargs.pop('cache_dir', None)
+        from_tf = kwargs.get('from_tf', False)
+        kwargs.pop('from_tf', None)
+
+        if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+            archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
+            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
+                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+            else:
+                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained weights.".format(
+                        archive_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_model_archive_map.keys()),
+                        archive_file))
+            return None
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_config_archive_map.keys()),
+                        config_file))
+            return None
+        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+            logger.info("loading weights file {}".format(archive_file))
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = cls.config_class.from_json_file(resolved_config_file)
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config {}".format(config))
+
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None and not from_tf:
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            return load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
+
+        # Load from a PyTorch state_dict
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+
+        start_prefix = ''
+        if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            start_prefix = cls.base_model_prefix + '.'  # Used to be able to load base models as well as derived modesl (with heads)
+        load(model, prefix=start_prefix)
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+
+        if hasattr(model, tie_weights):
+            model.tie_weights()  # make sure word embedding weights are still tied
+
+        return model
+
+
 def prune_linear_layer(layer, index, dim=0):
     """ Prune a linear layer (a model parameters) to keep only entries in index.
         Return the pruned layer as a new layer with requires_grad=True.
@@ -197,3 +361,16 @@ def prune_conv1d_layer(layer, index, dim=1):
     new_layer.bias.copy_(b.contiguous())
     new_layer.bias.requires_grad = True
     return new_layer
+
+
+def prune_layer(layer, index, dim=None):
+    """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    if isinstance(layer, nn.Linear):
+        return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
+    elif isinstance(layer, Conv1D):
+        return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
+    else:
+        raise ValueError("Can't prune layer of class {}".format(layer.__class__))
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 13ad591f72..27c747e405 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -30,7 +30,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, prune_linear_layer
+from .model_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
 
@@ -64,11 +64,9 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
 }
-BERT_CONFIG_NAME = 'bert_config.json'
-TF_WEIGHTS_NAME = 'model.ckpt'
 
 
-def load_tf_weights_in_bert(model, tf_checkpoint_path):
+def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
     try:
@@ -168,7 +166,8 @@ class BertConfig(PretrainedConfig):
                  max_position_embeddings=512,
                  type_vocab_size=2,
                  initializer_range=0.02,
-                 layer_norm_eps=1e-12):
+                 layer_norm_eps=1e-12,
+                 finetuning_task=None):
         """Constructs BertConfig.
 
         Args:
@@ -193,6 +192,7 @@ class BertConfig(PretrainedConfig):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
+            finetuning_task: name of the glue task on which the model was fine-tuned if any
         """
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -213,6 +213,7 @@ class BertConfig(PretrainedConfig):
             self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
+            self.finetuning_task = finetuning_task
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -539,20 +540,18 @@ class BertPreTrainingHeads(nn.Module):
         return prediction_scores, seq_relationship_score
 
 
-class BertPreTrainedModel(nn.Module):
+class BertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
-    def __init__(self, config, *inputs, **kwargs):
-        super(BertPreTrainedModel, self).__init__()
-        if not isinstance(config, BertConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
+    config_class = BertConfig
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_bert
+    base_model_prefix = "bert"
+
+    def __init__(self, *inputs, **kwargs):
+        super(BertPreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
         """ Initialize the weights.
@@ -567,152 +566,6 @@ class BertPreTrainedModel(nn.Module):
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `bert-base-uncased`
-                    . `bert-large-uncased`
-                    . `bert-base-cased`
-                    . `bert-large-cased`
-                    . `bert-base-multilingual-uncased`
-                    . `bert-base-multilingual-cased`
-                    . `bert-base-chinese`
-                    . `bert-base-german-cased`
-                    . `bert-large-uncased-whole-word-masking`
-                    . `bert-large-cased-whole-word-masking`
-                - a path or url to a pretrained model archive containing:
-                    . `bert_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
-                - a path or url to a pretrained model archive containing:
-                    . `bert_config.json` a configuration file for the model
-                    . `model.chkpt` a TensorFlow checkpoint
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
-            *inputs, **kwargs: additional input for the specific Bert class
-                (ex: num_labels for BertForSequenceClassification)
-        """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
-                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME)
-            else:
-                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                        archive_file))
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
-                        config_file))
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = BertConfig.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint
-            return load_tf_weights_in_bert(model, resolved_archive_file)
-        # Load from a PyTorch state_dict
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-        start_prefix = ''
-        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
-            start_prefix = 'bert.'
-        load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
-        return model
-
 
 class BertModel(BertPreTrainedModel):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 88ff521196..792d5e6777 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -32,7 +32,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, prune_conv1d_layer
+from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
 from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -42,7 +42,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.hugging
 PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                  "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
 
-def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
     try:
@@ -356,22 +356,18 @@ class GPT2MultipleChoiceHead(nn.Module):
         return multiple_choice_logits
 
 
-class GPT2PreTrainedModel(nn.Module):
+class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+    config_class = GPT2Config
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
 
-    def __init__(self, config, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__()
-        if not isinstance(config, GPT2Config):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        self.config = config
+    def __init__(self, *inputs, **kwargs):
+        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
         """ Initialize the weights.
@@ -407,130 +403,130 @@ class GPT2PreTrainedModel(nn.Module):
             state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific GPT2 class
         """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
+        # state_dict = kwargs.get('state_dict', None)
+        # kwargs.pop('state_dict', None)
+        # cache_dir = kwargs.get('cache_dir', None)
+        # kwargs.pop('cache_dir', None)
+        # from_tf = kwargs.get('from_tf', False)
+        # kwargs.pop('from_tf', None)
         num_special_tokens = kwargs.get('num_special_tokens', None)
         kwargs.pop('num_special_tokens', None)
 
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        archive_file
-                    )
-                )
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        config_file
-                    )
-                )
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = GPT2Config.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-            return load_tf_weights_in_gpt2(model, resolved_archive_file)
+        # if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+        #     archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
+        #     config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        # else:
+        #     archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+        #     config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # # redirect to the cache, if necessary
+        # try:
+        #     resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        # except EnvironmentError:
+        #     if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+        #         logger.error(
+        #             "Couldn't reach server at '{}' to download pretrained weights.".format(
+        #                 archive_file))
+        #     else:
+        #         logger.error(
+        #             "Model name '{}' was not found in model name list ({}). "
+        #             "We assumed '{}' was a path or url but couldn't find file {} "
+        #             "at this path or url.".format(
+        #                 pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
+        #                 archive_file
+        #             )
+        #         )
+        #     return None
+        # try:
+        #     resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        # except EnvironmentError:
+        #     if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
+        #         logger.error(
+        #             "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+        #                 config_file))
+        #     else:
+        #         logger.error(
+        #             "Model name '{}' was not found in model name list ({}). "
+        #             "We assumed '{}' was a path or url but couldn't find file {} "
+        #             "at this path or url.".format(
+        #                 pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
+        #                 config_file
+        #             )
+        #         )
+        #     return None
+        # if resolved_archive_file == archive_file and resolved_config_file == config_file:
+        #     logger.info("loading weights file {}".format(archive_file))
+        #     logger.info("loading configuration file {}".format(config_file))
+        # else:
+        #     logger.info("loading weights file {} from cache at {}".format(
+        #         archive_file, resolved_archive_file))
+        #     logger.info("loading configuration file {} from cache at {}".format(
+        #         config_file, resolved_config_file))
+        # # Load config
+        # config = GPT2Config.from_json_file(resolved_config_file)
+        # logger.info("Model config {}".format(config))
+        # # Instantiate model.
+        # model = cls(config, *inputs, **kwargs)
+        # if state_dict is None and not from_tf:
+        #     state_dict = torch.load(resolved_archive_file, map_location='cpu')
+        # if from_tf:
+        #     # Directly load from a TensorFlow checkpoint (stored as NumPy array)
+        #     return load_tf_weights_in_gpt2(model, resolved_archive_file)
 
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if key.endswith(".g"):
-                new_key = key[:-2] + ".weight"
-            elif key.endswith(".b"):
-                new_key = key[:-2] + ".bias"
-            elif key.endswith(".w"):
-                new_key = key[:-2] + ".weight"
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
+        # old_keys = []
+        # new_keys = []
+        # for key in state_dict.keys():
+        #     new_key = None
+        #     if key.endswith(".g"):
+        #         new_key = key[:-2] + ".weight"
+        #     elif key.endswith(".b"):
+        #         new_key = key[:-2] + ".bias"
+        #     elif key.endswith(".w"):
+        #         new_key = key[:-2] + ".weight"
+        #     if new_key:
+        #         old_keys.append(key)
+        #         new_keys.append(new_key)
+        # for old_key, new_key in zip(old_keys, new_keys):
+        #     state_dict[new_key] = state_dict.pop(old_key)
 
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, "_metadata", None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
+        # missing_keys = []
+        # unexpected_keys = []
+        # error_msgs = []
+        # # copy state_dict so _load_from_state_dict can modify it
+        # metadata = getattr(state_dict, "_metadata", None)
+        # state_dict = state_dict.copy()
+        # if metadata is not None:
+        #     state_dict._metadata = metadata
 
-        def load(module, prefix=""):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-            )
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
+        # def load(module, prefix=""):
+        #     local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+        #     module._load_from_state_dict(
+        #         state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
+        #     )
+        #     for name, child in module._modules.items():
+        #         if child is not None:
+        #             load(child, prefix + name + ".")
 
-        start_model = model
-        if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
-            start_model = model.transformer
-        load(start_model, prefix="")
+        # start_model = model
+        # if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
+        #     start_model = model.transformer
+        # load(start_model, prefix="")
 
-        if len(missing_keys) > 0:
-            logger.info(
-                "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-            )
-        if len(unexpected_keys) > 0:
-            logger.info(
-                "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-            )
-        if len(error_msgs) > 0:
-            raise RuntimeError(
-                "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-            )
+        # if len(missing_keys) > 0:
+        #     logger.info(
+        #         "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
+        #     )
+        # if len(unexpected_keys) > 0:
+        #     logger.info(
+        #         "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
+        #     )
+        # if len(error_msgs) > 0:
+        #     raise RuntimeError(
+        #         "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
+        #     )
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
+        model.set_num_special_tokens(num_special_tokens)
         return model
 
 
@@ -608,9 +604,9 @@ class GPT2Model(GPT2PreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens):
+    def set_num_special_tokens(self, num_special_tokens=None):
         " Update input embeddings with new embedding matrice if needed "
-        if self.config.n_special == num_special_tokens:
+        if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
         self.config.n_special = num_special_tokens
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 464bce26c0..670f250ef9 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -32,7 +32,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, prune_conv1d_layer
+from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
 from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -41,12 +41,17 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.h
 PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
 
 
-def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
+def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
     """
     import re
     import numpy as np
-    print("Loading weights...")
+
+    if '.ckpt' in openai_checkpoint_folder_path:
+        openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
+
+    logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
+
     names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
     shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
@@ -377,22 +382,18 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         return multiple_choice_logits
 
 
-class OpenAIGPTPreTrainedModel(nn.Module):
+class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+    config_class = OpenAIGPTConfig
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_openai_gpt
+    base_model_prefix = "transformer"
 
-    def __init__(self, config, *inputs, **kwargs):
-        super(OpenAIGPTPreTrainedModel, self).__init__()
-        if not isinstance(config, OpenAIGPTConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        self.config = config
+    def __init__(self, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
         """ Initialize the weights.
@@ -408,7 +409,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
             module.bias.data.zero_()
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         """
         Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
@@ -416,140 +417,25 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         Params:
             pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
-                    . `openai-gpt`
                 - a path or url to a pretrained model archive containing:
-                    . `openai_gpt_config.json` a configuration file for the model
+                    . `config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
                 - a path or url to a pretrained model archive containing:
-                    . `openai-gpt-config.json` a configuration file for the model
+                    . `config.json` a configuration file for the model
                     . a series of NumPy files containing OpenAI TensorFlow trained weights
             from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific OpenAI-GPT class
         """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
+        num_special_tokens = kwargs.get('num_special_tokens', None)
+        kwargs.pop('num_special_tokens', None)
 
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        archive_file
-                    )
-                )
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        config_file
-                    )
-                )
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = OpenAIGPTConfig.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-            return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
-
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if key.endswith(".g"):
-                new_key = key[:-2] + ".weight"
-            elif key.endswith(".b"):
-                new_key = key[:-2] + ".bias"
-            elif key.endswith(".w"):
-                new_key = key[:-2] + ".weight"
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, "_metadata", None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=""):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-            )
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
-
-        start_model = model
-        if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
-            start_model = model.transformer
-        load(start_model, prefix="")
-
-        if len(missing_keys) > 0:
-            logger.info(
-                "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-            )
-        if len(unexpected_keys) > 0:
-            logger.info(
-                "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-            )
-        if len(error_msgs) > 0:
-            raise RuntimeError(
-                "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-            )
+        model = PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
+        model.set_num_special_tokens(num_special_tokens)
         return model
 
 
@@ -621,9 +507,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens):
+    def set_num_special_tokens(self, num_special_tokens=None):
         " Update input embeddings with new embedding matrice if needed "
-        if self.config.n_special == num_special_tokens:
+        if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
         self.config.n_special = num_special_tokens
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 65e787c792..518abb86a3 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -38,7 +38,7 @@ from torch.nn.parameter import Parameter
 from .modeling import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
 
@@ -49,8 +49,6 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
 
-TF_WEIGHTS_NAME = 'model.ckpt'
-
 def build_tf_to_pytorch_map(model, config):
     """ A map of modules from TF to PyTorch.
         This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
@@ -787,28 +785,26 @@ class AdaptiveEmbedding(nn.Module):
         return embed
 
 
-class TransfoXLPreTrainedModel(nn.Module):
+class TransfoXLPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
-    def __init__(self, config, *inputs, **kwargs):
-        super(TransfoXLPreTrainedModel, self).__init__()
-        if not isinstance(config, TransfoXLConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
+    config_class = TransfoXLConfig
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_transfo_xl
+    base_model_prefix = "transformer"
 
-    def init_weight(self, weight):
+    def __init__(self, *inputs, **kwargs):
+        super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def _init_weight(self, weight):
         if self.config.init == 'uniform':
             nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
         elif self.config.init == 'normal':
             nn.init.normal_(weight, 0.0, self.config.init_std)
 
-    def init_bias(self, bias):
+    def _init_bias(self, bias):
         nn.init.constant_(bias, 0.0)
 
     def init_weights(self, m):
@@ -817,9 +813,9 @@ class TransfoXLPreTrainedModel(nn.Module):
         classname = m.__class__.__name__
         if classname.find('Linear') != -1:
             if hasattr(m, 'weight') and m.weight is not None:
-                self.init_weight(m.weight)
+                self._init_weight(m.weight)
             if hasattr(m, 'bias') and m.bias is not None:
-                self.init_bias(m.bias)
+                self._init_bias(m.bias)
         elif classname.find('AdaptiveEmbedding') != -1:
             if hasattr(m, 'emb_projs'):
                 for i in range(len(m.emb_projs)):
@@ -827,12 +823,12 @@ class TransfoXLPreTrainedModel(nn.Module):
                         nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
         elif classname.find('Embedding') != -1:
             if hasattr(m, 'weight'):
-                self.init_weight(m.weight)
+                self._init_weight(m.weight)
         elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
             if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
-                self.init_weight(m.cluster_weight)
+                self._init_weight(m.cluster_weight)
             if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
-                self.init_bias(m.cluster_bias)
+                self._init_bias(m.cluster_bias)
             if hasattr(m, 'out_projs'):
                 for i in range(len(m.out_projs)):
                     if m.out_projs[i] is not None:
@@ -841,144 +837,20 @@ class TransfoXLPreTrainedModel(nn.Module):
             if hasattr(m, 'weight'):
                 nn.init.normal_(m.weight, 1.0, self.config.init_std)
             if hasattr(m, 'bias') and m.bias is not None:
-                self.init_bias(m.bias)
+                self._init_bias(m.bias)
         elif classname.find('TransformerLM') != -1:
             if hasattr(m, 'r_emb'):
-                self.init_weight(m.r_emb)
+                self._init_weight(m.r_emb)
             if hasattr(m, 'r_w_bias'):
-                self.init_weight(m.r_w_bias)
+                self._init_weight(m.r_w_bias)
             if hasattr(m, 'r_r_bias'):
-                self.init_weight(m.r_r_bias)
+                self._init_weight(m.r_r_bias)
             if hasattr(m, 'r_bias'):
-                self.init_bias(m.r_bias)
+                self._init_bias(m.r_bias)
 
     def set_num_special_tokens(self, num_special_tokens):
         pass
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `transfo-xl-wt103`
-                - a path or url to a pretrained model archive containing:
-                    . `transfo_xl_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `transfo_xl_config.json` a configuration file for the model
-                    . `model.chkpt` a TensorFlow checkpoint
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific TransformerXL class
-        """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        archive_file
-                    )
-                )
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        config_file
-                    )
-                )
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = TransfoXLConfig.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint
-            return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-
-        start_prefix = ''
-        if not hasattr(model, 'transformer') and any(s.startswith('transformer.') for s in state_dict.keys()):
-            start_prefix = 'transformer.'
-        load(model, prefix=start_prefix)
-
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
-        # Make sure we are still sharing the input and output embeddings
-        if hasattr(model, 'tie_weights'):
-            model.tie_weights()
-        return model
-
 
 class TransfoXLModel(TransfoXLPreTrainedModel):
     """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index aa23618199..f0b9538ae4 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -36,7 +36,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
 
@@ -390,20 +390,18 @@ class BeamHypotheses(object):
             return self.worst_score >= best_sum_logprobs / self.max_len ** self.length_penalty
 
 
-class XLMPreTrainedModel(nn.Module):
+class XLMPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
-    def __init__(self, config, *inputs, **kwargs):
-        super(XLMPreTrainedModel, self).__init__()
-        if not isinstance(config, XLMBaseConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `XLMBaseConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
+    config_class = XLMConfig
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = None
+    base_model_prefix = "xlm"
+
+    def __init__(self, *inputs, **kwargs):
+        super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
         """ Initialize the weights.
@@ -423,138 +421,6 @@ class XLMPreTrainedModel(nn.Module):
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a XLMPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `xlnet-large-cased`
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a XLMForPreTraining instance
-                - a path or url to a pretrained model archive containing:
-                    . `xlnet_config.json` a configuration file for the model
-                    . `model.chkpt` a TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
-            *inputs, **kwargs: additional input for the specific XLM class
-                (ex: num_labels for XLMForSequenceClassification)
-        """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
-                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, XLNET_CONFIG_NAME)
-            else:
-                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                        archive_file))
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
-                        config_file))
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-
-        # Load config
-        config = XLMConfig.from_json_file(resolved_config_file)
-
-        # Update config with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(config, key):
-                setattr(config, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info("Model config {}".format(config))
-
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-
-        # Load from a PyTorch state_dict
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-        start_prefix = ''
-        if not hasattr(model, 'transformer') and any(s.startswith('transformer') for s in state_dict.keys()):
-            start_prefix = 'transformer.'
-        load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
-        if isinstance(model, XLMLMHeadModel):
-            model.tie_weights()  # make sure word embedding weights are still tied
-        return model
-
 
 class XLMModel(XLMPreTrainedModel):
 
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index e9c7c72e12..d4b66bc4ad 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -33,7 +33,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 
 logger = logging.getLogger(__name__)
@@ -44,11 +44,9 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
 PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 }
-XLNET_CONFIG_NAME = 'xlnet_config.json'
-TF_WEIGHTS_NAME = 'model.ckpt'
 
 
-def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_task=None):
+def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
     """ A map of modules from TF to PyTorch.
         I use a map to keep the PyTorch model as
         identical to the original PyTorch model as possible.
@@ -64,9 +62,9 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_tas
             # We will load also the sequence summary
             tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
             tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
-        if hasattr(model, 'logits_proj') and finetuning_task is not None and 'model/regression_{}/logit/kernel'.format(finetuning_task) in tf_weights:
-            tf_to_pt_map['model/regression_{}/logit/kernel'.format(finetuning_task)] = model.logits_proj.weight
-            tf_to_pt_map['model/regression_{}/logit/bias'.format(finetuning_task)] = model.logits_proj.bias
+        if hasattr(model, 'logits_proj') and config.finetuning_task is not None and 'model/regression_{}/logit/kernel'.format(finetuning_task) in tf_weights:
+            tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight
+            tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias
 
         # Now load the rest of the transformer
         model = model.transformer
@@ -117,7 +115,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None, finetuning_tas
         'model/transformer/seg_embed': seg_embed_list})
     return tf_to_pt_map
 
-def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None):
+def load_tf_weights_in_xlnet(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
     try:
@@ -138,7 +136,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path, finetuning_task=None):
     input("Press Enter to continue...")
 
     # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights, finetuning_task)
+    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
 
     for name, pointer in tf_to_pt_map.items():
         print("Importing {}".format(name))
@@ -223,7 +221,8 @@ class XLNetConfig(PretrainedConfig):
                  reuse_len=None,
                  bi_data=False,
                  clamp_len=-1,
-                 same_length=False):
+                 same_length=False,
+                 finetuning_task=None):
         """Constructs XLNetConfig.
 
         Args:
@@ -265,6 +264,7 @@ class XLNetConfig(PretrainedConfig):
             clamp_len: int, clamp all relative distances larger than clamp_len.
                 -1 means no clamping.
             same_length: bool, whether to use the same attention length for each token.
+            finetuning_task: name of the glue task on which the model was fine-tuned if any
         """
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -298,6 +298,7 @@ class XLNetConfig(PretrainedConfig):
             self.bi_data = bi_data
             self.clamp_len = clamp_len
             self.same_length = same_length
+            self.finetuning_task = finetuning_task
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -550,20 +551,19 @@ class XLNetLayer(nn.Module):
         #     return attentions, layer_output
         return output_h, output_g
 
-class XLNetPreTrainedModel(nn.Module):
+
+class XLNetPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
-    def __init__(self, config, *inputs, **kwargs):
-        super(XLNetPreTrainedModel, self).__init__()
-        if not isinstance(config, XLNetConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `XLNetConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
+    config_class = XLNetConfig
+    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_xlnet
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(XLNetPreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
         """ Initialize the weights.
@@ -583,144 +583,6 @@ class XLNetPreTrainedModel(nn.Module):
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a XLNetPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `xlnet-large-cased`
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
-                - a path or url to a pretrained model archive containing:
-                    . `xlnet_config.json` a configuration file for the model
-                    . `model.chkpt` a TensorFlow checkpoint
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
-            *inputs, **kwargs: additional input for the specific XLNet class
-                (ex: num_labels for XLNetForSequenceClassification)
-        """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
-                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, XLNET_CONFIG_NAME)
-            else:
-                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                        archive_file))
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
-                        config_file))
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-
-        # Load config
-        config = XLNetConfig.from_json_file(resolved_config_file)
-
-        # Update config with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(config, key):
-                setattr(config, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info("Model config {}".format(config))
-
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint
-            return load_tf_weights_in_xlnet(model, config, resolved_archive_file)
-
-        # Load from a PyTorch state_dict
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-        start_prefix = ''
-        if not hasattr(model, 'transformer') and any(s.startswith('transformer') for s in state_dict.keys()):
-            start_prefix = 'transformer.'
-        load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
-        if isinstance(model, XLNetLMHeadModel):
-            model.tie_weights()  # make sure word embedding weights are still tied
-        return model
-
 
 class XLNetModel(XLNetPreTrainedModel):
     def __init__(self, config, output_attentions=False, keep_multihead_output=False):

From f56b8033f09c01f1217d944f18c45355e4bdc65b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 13:13:15 +0200
Subject: [PATCH 025/139] more versatile loading

---
 pytorch_pretrained_bert/model_utils.py  | 13 +++++--
 pytorch_pretrained_bert/modeling_xlm.py | 50 ++++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index c262d7b6c8..1bc1632580 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -255,7 +255,7 @@ class PreTrainedModel(nn.Module):
             state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
             # Directly load from a TensorFlow checkpoint
-            return load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
+            return cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
 
         # Load from a PyTorch state_dict
         missing_keys = []
@@ -275,10 +275,15 @@ class PreTrainedModel(nn.Module):
                 if child is not None:
                     load(child, prefix + name + '.')
 
+        # Be able to load base models as well as derived models (with heads)
         start_prefix = ''
+        model_to_load = model
         if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
-            start_prefix = cls.base_model_prefix + '.'  # Used to be able to load base models as well as derived modesl (with heads)
-        load(model, prefix=start_prefix)
+            start_prefix = cls.base_model_prefix + '.'
+        if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            model_to_load = getattr(model, cls.base_model_prefix)
+
+        load(model_to_load, prefix=start_prefix)
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
                 model.__class__.__name__, missing_keys))
@@ -289,7 +294,7 @@ class PreTrainedModel(nn.Module):
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
 
-        if hasattr(model, tie_weights):
+        if hasattr(model, 'tie_weights'):
             model.tie_weights()  # make sure word embedding weights are still tied
 
         return model
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index f0b9538ae4..36fc7db191 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -430,8 +430,54 @@ class XLMModel(XLMPreTrainedModel):
                   'asm_cutoffs', 'asm_div_value']
 
     def __init__(self, params, output_attentions=False, keep_multihead_output=False):  #, dico, is_encoder, with_output):
-        """
-        Transformer model (encoder or decoder).
+        """XLM model ("Bidirectional Embedding Representations from a Transformer").
+
+        Params:
+            `config`: a BertConfig class instance with the configuration to build a new model
+            `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+            `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+                This can be used to compute head importance metrics. Default: False
+
+        Inputs:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+        Outputs: Tuple of (encoded_layers, pooled_output)
+            `encoded_layers`: controled by `output_all_encoded_layers` argument:
+                - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                    of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                    encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+                - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
+            `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+                classifier pretrained on top of the hidden state associated to the first character of the
+                input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
+
+        Example usage:
+        ```python
+        # Already been converted into WordPiece token ids
+        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+        config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+        model = modeling.BertModel(config=config)
+        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+        ```
         """
         super(XLMModel, self).__init__(params)
         self.output_attentions = output_attentions

From 3deea56c07d4361da74e57947c3a573463ea4457 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 13:41:12 +0200
Subject: [PATCH 026/139] fixing loading fucntion

---
 pytorch_pretrained_bert/modeling_xlm.py   | 36 ++++++++++-------------
 pytorch_pretrained_bert/modeling_xlnet.py |  5 ++--
 2 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 36fc7db191..8cb56de253 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -430,10 +430,12 @@ class XLMModel(XLMPreTrainedModel):
                   'asm_cutoffs', 'asm_div_value']
 
     def __init__(self, params, output_attentions=False, keep_multihead_output=False):  #, dico, is_encoder, with_output):
-        """XLM model ("Bidirectional Embedding Representations from a Transformer").
+        """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+            Paper: https://arxiv.org/abs/1901.07291
+            Original code: https://github.com/facebookresearch/XLM
 
         Params:
-            `config`: a BertConfig class instance with the configuration to build a new model
+            `config`: a XLMConfig class instance with the configuration to build a new model
             `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
             `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
                 This can be used to compute head importance metrics. Default: False
@@ -444,7 +446,7 @@ class XLMModel(XLMPreTrainedModel):
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
             `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
+                a `sentence B` token (see XLM paper for more details).
             `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
@@ -457,13 +459,13 @@ class XLMModel(XLMPreTrainedModel):
         Outputs: Tuple of (encoded_layers, pooled_output)
             `encoded_layers`: controled by `output_all_encoded_layers` argument:
                 - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                    of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                    of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
                     encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
                 - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                     to the last attention block of shape [batch_size, sequence_length, hidden_size],
             `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
                 classifier pretrained on top of the hidden state associated to the first character of the
-                input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
+                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
 
         Example usage:
         ```python
@@ -472,10 +474,10 @@ class XLMModel(XLMPreTrainedModel):
         input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
         token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-        config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
             num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-        model = modeling.BertModel(config=config)
+        model = modeling.XLMModel(config=config)
         all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
         ```
         """
@@ -1046,7 +1048,7 @@ class XLMModel(XLMPreTrainedModel):
                 0 for real tokens and 1 for padding.
             attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
                 but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
+                Added for easy compatibility with the XLM model (which uses this negative masking).
                 You can only uses one among `input_mask` and `attention_mask`
             mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
@@ -1106,7 +1108,7 @@ class XLMModel(XLMPreTrainedModel):
 
         # data mask: input mask & perm mask
         assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
-        "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+        "or attention_mask (uses 0 for padding, added for compatbility with XLM). Please choose one."
         if input_mask is None and attention_mask is not None:
             input_mask = 1.0 - attention_mask
         if input_mask is not None and perm_mask is not None:
@@ -1262,12 +1264,8 @@ class XLMLMHeadModel(XLMPreTrainedModel):
     Inputs:
         inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
         token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
+        attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
-        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
         mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
             If None, no memory is used.
@@ -1340,10 +1338,6 @@ class XLMLMHeadModel(XLMPreTrainedModel):
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
@@ -1440,7 +1434,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
             0 for real tokens and 1 for padding.
         attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
             but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
+            Added for easy compatibility with the XLM model (which uses this negative masking).
             You can only uses one among `input_mask` and `attention_mask`
         mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
             from previous batches. The length of the list equals n_layer.
@@ -1515,7 +1509,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
                 0 for real tokens and 1 for padding.
             attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
                 but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
+                Added for easy compatibility with the XLM model (which uses this negative masking).
                 You can only uses one among `input_mask` and `attention_mask`
             mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
@@ -1582,7 +1576,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
             a `sentence B` token (see XLM paper for more details).
         `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
             but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
+            Added for easy compatibility with the XLM model (which uses this negative masking).
             You can only uses one among `input_mask` and `attention_mask`
         `input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index d4b66bc4ad..dd45f93d62 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -62,7 +62,8 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
             # We will load also the sequence summary
             tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
             tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
-        if hasattr(model, 'logits_proj') and config.finetuning_task is not None and 'model/regression_{}/logit/kernel'.format(finetuning_task) in tf_weights:
+        if hasattr(model, 'logits_proj') and config.finetuning_task is not None \
+                and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights:
             tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight
             tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias
 
@@ -133,8 +134,6 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
 
-    input("Press Enter to continue...")
-
     # Build TF to PyTorch weights loading map
     tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
 

From 0c2ff348151cab8f245e352bb249d18b5623e3bc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 27 Jun 2019 09:27:50 +0200
Subject: [PATCH 027/139] extracting double hidden-state from xlnet

---
 pytorch_pretrained_bert/modeling_xlnet.py | 30 +++++++----------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index dd45f93d62..2da50313af 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -703,8 +703,7 @@ class XLNetModel(XLNetPreTrainedModel):
         return pos_emb
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                output_all_encoded_layers=True, head_mask=None):
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -856,13 +855,14 @@ class XLNetModel(XLNetPreTrainedModel):
         for i, layer_module in enumerate(self.layer):
             # cache new mems
             new_mems.append(self.cache_mem(output_h, mems[i]))
+            hidden_states.append((output_h, output_g))
 
             output_h, output_g = layer_module(output_h, output_g,
                                               attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
                                               r=pos_emb, seg_mat=seg_mat,
                                               mems=mems[i], target_mapping=target_mapping,
                                               head_mask=head_mask)
-            hidden_states.append(output_h)
+        hidden_states.append((output_h, output_g))
         output = self.dropout(output_g if output_g is not None else output_h)
 
         # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
@@ -955,7 +955,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                labels=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -987,8 +987,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                 to pool the input to get a vector representation.
         """
         output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+                                            mems, perm_mask, target_mapping, inp_q, head_mask)
 
         logits = self.lm_loss(output)
 
@@ -1001,10 +1000,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         # if self.output_attentions:
         #     all_attentions, encoded_layers = encoded_layers
-        # sequence_output = encoded_layers[-1]
-        # pooled_output = self.pooler(sequence_output)
-        # if not output_all_encoded_layers:
-        #     encoded_layers = encoded_layers[-1]
         # if self.output_attentions:
         return logits, new_mems
         #     return all_attentions, encoded_layers, pooled_output
@@ -1127,7 +1122,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                labels=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1156,8 +1151,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Set to None during finetuning.
         """
         output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q,
-                                               output_all_encoded_layers, head_mask)
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
 
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
@@ -1174,10 +1168,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
         # if self.output_attentions:
         #     all_attentions, encoded_layers = encoded_layers
-        # sequence_output = encoded_layers[-1]
-        # pooled_output = self.pooler(sequence_output)
-        # if not output_all_encoded_layers:
-        #     encoded_layers = encoded_layers[-1]
         # if self.output_attentions:
         return logits, new_mems
         #     return all_attentions, encoded_layers, pooled_output
@@ -1248,11 +1238,9 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                start_positions=None, end_positions=None,
-                output_all_encoded_layers=True, head_mask=None):
+                start_positions=None, end_positions=None, head_mask=None):
         output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+                                            mems, perm_mask, target_mapping, inp_q, head_mask)
 
         logits = self.qa_outputs(output)
         start_logits, end_logits = logits.split(1, dim=-1)

From d939d6fd0205431f53fc8696543258c82ccc5366 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 27 Jun 2019 09:39:44 +0200
Subject: [PATCH 028/139] fix hidden-state extraction

---
 pytorch_pretrained_bert/modeling_xlnet.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 2da50313af..c30e263181 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -855,19 +855,22 @@ class XLNetModel(XLNetPreTrainedModel):
         for i, layer_module in enumerate(self.layer):
             # cache new mems
             new_mems.append(self.cache_mem(output_h, mems[i]))
-            hidden_states.append((output_h, output_g))
+            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
             output_h, output_g = layer_module(output_h, output_g,
                                               attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
                                               r=pos_emb, seg_mat=seg_mat,
                                               mems=mems[i], target_mapping=target_mapping,
                                               head_mask=head_mask)
-        hidden_states.append((output_h, output_g))
+        hidden_states.append((output_h, output_g) if output_g is not None else output_h)
         output = self.dropout(output_g if output_g is not None else output_h)
 
         # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
         output = output.permute(1, 0, 2).contiguous()
-        hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+        if output_g is not None:
+            hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
+        else:
+            hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
 
         return output, hidden_states, new_mems
 

From 3a00674cbf34d22a3ce23a13492f98b30482a13c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 27 Jun 2019 17:18:46 +0200
Subject: [PATCH 029/139] fix imports

---
 examples/run_bert_classifier.py  | 2 +-
 examples/run_bert_squad.py       | 2 +-
 examples/run_xlnet_classifier.py | 2 +-
 examples/run_xlnet_squad.py      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 8bb37159d3..d987b35321 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -34,7 +34,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index 9aaa711c2b..54eceb36f7 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -33,7 +33,7 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_pretrained_bert.tokenization import BertTokenizer
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 2309815981..fb5501e370 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -34,7 +34,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
 from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index 927668c57a..c299358b79 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -33,7 +33,7 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering
 from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

From 2b56e9889284b5432881e947aefbf7ed6780e4ec Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 28 Jun 2019 16:35:09 +0200
Subject: [PATCH 030/139] standardizing API across models - XLNetForSeqClass
 working

---
 examples/run_xlnet_classifier.py          |  59 ++++---
 pytorch_pretrained_bert/modeling.py       | 137 +++++++--------
 pytorch_pretrained_bert/modeling_xlm.py   | 132 ++++++++-------
 pytorch_pretrained_bert/modeling_xlnet.py | 196 ++++++++++++----------
 4 files changed, 277 insertions(+), 247 deletions(-)

diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index fb5501e370..e30cad773b 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -67,6 +67,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0 limit the number of training steps to perform, you should choose only one of num_train_epochs and max_steps.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training to perform linear learning rate warmup for. "
                              "E.g., 0.1 = 10%% of training.")
@@ -189,8 +191,7 @@ def main():
         model = torch.nn.DataParallel(model)
 
     global_step = 0
-    nb_tr_steps = 0
-    tr_loss = 0
+    curr_tr_loss, curr_steps = 0., 1
 
     if args.do_train:
         if args.local_rank in [-1, 0]:
@@ -229,12 +230,15 @@ def main():
 
         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
         if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
+            train_sampler = SequentialSampler(train_data)  # RandomSampler(train_data)
         else:
             train_sampler = DistributedSampler(train_data)
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
 
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        if args.max_steps > 0:
+            num_train_optimization_steps = args.max_steps
+        else:
+            num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
         # Prepare optimizer
 
@@ -275,22 +279,16 @@ def main():
         logger.info("  Num steps = %d", num_train_optimization_steps)
 
         model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        for _ in trange(int(args.num_train_epochs) if args.max_steps <= 0 else int('Inf'),
+                        desc="Epoch", disable=args.local_rank not in [-1, 0]):
+            for step, batch in enumerate(tqdm(train_dataloader,
+                                              desc="Iteration",
+                                              disable=args.local_rank not in [-1, 0])):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, label_ids = batch
 
                 # define a new function to compute loss values for both output_modes
-                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-                if output_mode == "classification":
-                    loss_fct = CrossEntropyLoss()
-                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-                elif output_mode == "regression":
-                    loss_fct = MSELoss()
-                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
 
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
@@ -302,12 +300,10 @@ def main():
                 else:
                     loss.backward()
 
-                if args.clip_gradients > 0.0:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)
+                gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)
 
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
+                curr_tr_loss += loss.item()
+                curr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
                     if args.fp16:
                         # modify learning rate with special warm up BERT uses
@@ -318,10 +314,19 @@ def main():
                     optimizer.step()
                     optimizer.zero_grad()
                     global_step += 1
-                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (step + 1) % args.log_every == 0):
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
+                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (global_step + 1) % args.log_every == 0):
+                        learning_rate = optimizer.get_lr()[0] if not args.fp16 else lr_this_step
+                        logger.info("[{}] | gnorm {:.2f} lr {:8.6f} | loss {:.2f}".format(
+                            global_step, gnorm, learning_rate, curr_tr_loss / curr_steps))
+                        tb_writer.add_scalar('lr', learning_rate, global_step)
+                        tb_writer.add_scalar('loss', curr_tr_loss / curr_steps, global_step)
+                        curr_tr_loss, curr_steps = 0., 1
+
+                    if args.max_steps > 0 and global_step > args.max_steps:
+                        break
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                break
 
     ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     ### Example:
@@ -435,7 +440,7 @@ def main():
             preds = np.squeeze(preds)
         result = compute_metrics(task_name, preds, out_label_ids)
 
-        loss = tr_loss/global_step if args.do_train else None
+        loss = curr_tr_loss/curr_steps if args.do_train else None
 
         result['eval_loss'] = eval_loss
         result['global_step'] = global_step
@@ -508,7 +513,7 @@ def main():
             preds = np.argmax(preds, axis=1)
             result = compute_metrics(task_name, preds, out_label_ids)
 
-            loss = tr_loss/global_step if args.do_train else None
+            loss = curr_tr_loss/curr_steps if args.do_train else None
 
             result['eval_loss'] = eval_loss
             result['global_step'] = global_step
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 27c747e405..eade7310f9 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -270,15 +270,13 @@ class BertEmbeddings(nn.Module):
 
 
 class BertSelfAttention(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False):
         super(BertSelfAttention, self).__init__()
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
         self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
 
         self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
@@ -329,9 +327,9 @@ class BertSelfAttention(nn.Module):
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
-        if self.output_attentions:
-            return attention_probs, context_layer
-        return context_layer
+
+        outputs = [context_layer, attention_probs] if self.output_attentions else [context_layer]
+        return outputs
 
 
 class BertSelfOutput(nn.Module):
@@ -349,11 +347,10 @@ class BertSelfOutput(nn.Module):
 
 
 class BertAttention(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False):
         super(BertAttention, self).__init__()
         self.output_attentions = output_attentions
-        self.self = BertSelfAttention(config, output_attentions=output_attentions,
-                                              keep_multihead_output=keep_multihead_output)
+        self.self = BertSelfAttention(config, output_attentions=output_attentions)
         self.output = BertSelfOutput(config)
 
     def prune_heads(self, heads):
@@ -374,13 +371,10 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
 
     def forward(self, input_tensor, attention_mask, head_mask=None):
-        self_output = self.self(input_tensor, attention_mask, head_mask)
-        if self.output_attentions:
-            attentions, self_output = self_output
-        attention_output = self.output(self_output, input_tensor)
-        if self.output_attentions:
-            return attentions, attention_output
-        return attention_output
+        self_outputs = self.self(input_tensor, attention_mask, head_mask)
+        attention_output = self.output(self_outputs[0], input_tensor)
+        outputs = [attention_output] + self_outputs[1:]  # add attentions if we output them
+        return outputs
 
 
 class BertIntermediate(nn.Module):
@@ -413,48 +407,52 @@ class BertOutput(nn.Module):
 
 
 class BertLayer(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False):
         super(BertLayer, self).__init__()
         self.output_attentions = output_attentions
-        self.attention = BertAttention(config, output_attentions=output_attentions,
-                                               keep_multihead_output=keep_multihead_output)
+        self.attention = BertAttention(config, output_attentions=output_attentions)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, attention_mask, head_mask=None):
-        attention_output = self.attention(hidden_states, attention_mask, head_mask)
-        if self.output_attentions:
-            attentions, attention_output = attention_output
-        intermediate_output = self.intermediate(attention_output)
+        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+        intermediate_output = self.intermediate(attention_outputs[0])
         layer_output = self.output(intermediate_output, attention_output)
-        if self.output_attentions:
-            return attentions, layer_output
-        return layer_output
+        outputs = [layer_output] + attention_outputs[1:]  # add attentions if we output them
+        return outputs
 
 
 class BertEncoder(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertEncoder, self).__init__()
         self.output_attentions = output_attentions
-        layer = BertLayer(config, output_attentions=output_attentions,
-                                  keep_multihead_output=keep_multihead_output)
+        self.output_hidden_states = output_hidden_states
+        layer = BertLayer(config, output_attentions=output_attentions)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None):
-        all_encoder_layers = []
+    def forward(self, hidden_states, attention_mask, head_mask=None):
+        all_hidden_states = []
         all_attentions = []
         for i, layer_module in enumerate(self.layer):
-            hidden_states = layer_module(hidden_states, attention_mask, head_mask[i])
+            if self.output_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
+            hidden_states = layer_outputs[0]
+
             if self.output_attentions:
-                attentions, hidden_states = hidden_states
-                all_attentions.append(attentions)
-            if output_all_encoded_layers:
-                all_encoder_layers.append(hidden_states)
-        if not output_all_encoded_layers:
-            all_encoder_layers.append(hidden_states)
+                all_attentions.append(layer_outputs[1])
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states.append(hidden_states)
+
+        outputs = [hidden_states]
+        if self.output_hidden_states:
+            outputs.append(all_hidden_states)
         if self.output_attentions:
-            return all_attentions, all_encoder_layers
-        return all_encoder_layers
+            outputs.append(all_attentions)
+        return outputs  # outputs, (hidden states), (attentions)
 
 
 class BertPooler(nn.Module):
@@ -617,12 +615,13 @@ class BertModel(BertPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
         self.embeddings = BertEmbeddings(config)
         self.encoder = BertEncoder(config, output_attentions=output_attentions,
-                                           keep_multihead_output=keep_multihead_output)
+                                           output_hidden_states=output_hidden_states)
         self.pooler = BertPooler(config)
         self.apply(self.init_weights)
 
@@ -633,13 +632,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [layer.attention.self.multihead_output for layer in self.encoder.layer]
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, head_mask=None):
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -676,19 +669,14 @@ class BertModel(BertPreTrainedModel):
             head_mask = [None] * self.config.num_hidden_layers
 
         embedding_output = self.embeddings(input_ids, token_type_ids)
-        encoded_layers = self.encoder(embedding_output,
-                                      extended_attention_mask,
-                                      output_all_encoded_layers=output_all_encoded_layers,
-                                      head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, encoded_layers = encoded_layers
-        sequence_output = encoded_layers[-1]
+        encoder_outputs = self.encoder(embedding_output,
+                                       extended_attention_mask,
+                                       head_mask=head_mask)
+        sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
-        if not output_all_encoded_layers:
-            encoded_layers = encoded_layers[-1]
-        if self.output_attentions:
-            return all_attentions, encoded_layers, pooled_output
-        return encoded_layers, pooled_output
+
+        outputs = [sequence_output, pooled_output] + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
 class BertForPreTraining(BertPreTrainedModel):
@@ -746,32 +734,33 @@ class BertForPreTraining(BertPreTrainedModel):
     masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertForPreTraining, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+                                      output_hidden_states=output_hidden_states)
         self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask,
-                                                   output_all_encoded_layers=False, head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, sequence_output, pooled_output = outputs
-        else:
-            sequence_output, pooled_output = outputs
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                next_sentence_label=None, head_mask=None):
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+
+        sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
 
+        outputs = [prediction_scores, seq_relationship_score] + outputs[2:]  # add hidden states and attention if they are here
+
         if masked_lm_labels is not None and next_sentence_label is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
             total_loss = masked_lm_loss + next_sentence_loss
-            return total_loss
-        elif self.output_attentions:
-            return all_attentions, prediction_scores, seq_relationship_score
-        return prediction_scores, seq_relationship_score
+            outputs = [total_loss] + outputs
+
+        return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
 
 class BertForMaskedLM(BertPreTrainedModel):
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 8cb56de253..92e1cc124c 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -919,9 +919,11 @@ class XLMModel(XLMPreTrainedModel):
 
 
 class XLMModel(XLMPreTrainedModel):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLMModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.mem_len = config.mem_len
         self.reuse_len = config.reuse_len
         self.d_model = config.d_model
@@ -1038,8 +1040,7 @@ class XLMModel(XLMPreTrainedModel):
         return pos_emb
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                output_all_encoded_layers=True, head_mask=None):
+                mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1188,23 +1189,45 @@ class XLMModel(XLMPreTrainedModel):
             mems = [None] * len(self.layer)
 
         hidden_states = []
+        attentions = []
         for i, layer_module in enumerate(self.layer):
             # cache new mems
             new_mems.append(self.cache_mem(output_h, mems[i]))
+            # Save hidden_states
+            if output_g is None:
+                hidden_states.append(output_h)
+            else:
+                hidden_states.append((output_h, output_g))
 
             output_h, output_g = layer_module(output_h, output_g,
                                               attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
                                               r=pos_emb, seg_mat=seg_mat,
                                               mems=mems[i], target_mapping=target_mapping,
                                               head_mask=head_mask)
+        # Save last hidden_state
+        if output_g is None:
             hidden_states.append(output_h)
+        else:
+            hidden_states.append((output_h, output_g))
+
+        # Select the right output and add dropout
         output = self.dropout(output_g if output_g is not None else output_h)
 
         # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
         output = output.permute(1, 0, 2).contiguous()
-        hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+        if output_g is None:
+            hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+        else:
+            hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
 
-        return output, hidden_states, new_mems
+        # Build the list of outputs
+        outputs = [output, new_mems]
+        if self.output_attentions:
+            outputs.append(attentions)
+        if self.output_hidden_states:
+            outputs.append(hidden_states)
+
+        return outputs
 
 
 class XLMPredLayer(nn.Module):
@@ -1309,14 +1332,15 @@ class XLMLMHeadModel(XLMPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLMLMHeadModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.attn_type = config.attn_type
         self.same_length = config.same_length
 
-        self.transformer = XLMModel(config, output_attentions=output_attentions,
-                                              keep_multihead_output=keep_multihead_output)
+        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
         # Tie weights
@@ -1331,7 +1355,7 @@ class XLMLMHeadModel(XLMPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                labels=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1358,33 +1382,28 @@ class XLMLMHeadModel(XLMPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
 
+        output = transformer_outputs[0]
         logits = self.lm_loss(output)
 
+        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
         if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(logits.view(-1, logits.size(-1)),
                             labels.view(-1))
-            return loss, new_mems
+            outputs = [loss] + outputs
 
-        # if self.output_attentions:
-        #     all_attentions, encoded_layers = encoded_layers
-        # sequence_output = encoded_layers[-1]
-        # pooled_output = self.pooler(sequence_output)
-        # if not output_all_encoded_layers:
-        #     encoded_layers = encoded_layers[-1]
-        # if self.output_attentions:
-        return logits, new_mems
-        #     return all_attentions, encoded_layers, pooled_output
+        outputs = [logits] + outputs
+
+        return outputs
 
 
 class XLMSequenceSummary(nn.Module):
-    def __init__(self, config, summary_type="last", use_proj=True,
-                 output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, summary_type="last", use_proj=True):
         super(XLMSequenceSummary, self).__init__()
         self.summary_type = summary_type
         if use_proj:
@@ -1481,26 +1500,23 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 output_attentions=False, keep_multihead_output=False):
+                 output_attentions=False, output_hidden_states=False):
         super(XLMForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
-        self.attn_type = config.attn_type
-        self.same_length = config.same_length
+        self.output_hidden_states = output_hidden_states
+
         self.summary_type = summary_type
         self.num_labels = num_labels
 
-        self.transformer = XLMModel(config, output_attentions=output_attentions,
-                                              keep_multihead_output=keep_multihead_output)
+        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
 
-        self.sequence_summary = XLMSequenceSummary(config, summary_type=summary_type,
-                                                     use_proj=use_proj, output_attentions=output_attentions,
-                                                     keep_multihead_output=keep_multihead_output)
+        self.sequence_summary = XLMSequenceSummary(config, summary_type=summary_type, use_proj=use_proj)
         self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                labels=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1528,13 +1544,15 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q,
-                                               output_all_encoded_layers, head_mask)
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
 
+        output = transformer_outputs[0]
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
 
+        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
         if labels is not None:
             if self.num_labels == 1:
                 #  We are doing regression
@@ -1543,17 +1561,11 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss, new_mems
+            outputs = [loss] + outputs
 
-        # if self.output_attentions:
-        #     all_attentions, encoded_layers = encoded_layers
-        # sequence_output = encoded_layers[-1]
-        # pooled_output = self.pooler(sequence_output)
-        # if not output_all_encoded_layers:
-        #     encoded_layers = encoded_layers[-1]
-        # if self.output_attentions:
-        return logits, new_mems
-        #     return all_attentions, encoded_layers, pooled_output
+        outputs = [logits] + outputs
+
+        return outputs
 
 
 class XLMForQuestionAnswering(XLMPreTrainedModel):
@@ -1612,27 +1624,30 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLMForQuestionAnswering, self).__init__(config)
         self.output_attentions = output_attentions
-        self.transformer = XLMModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+        self.output_hidden_states = output_hidden_states
+
+        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                start_positions=None, end_positions=None,
-                output_all_encoded_layers=True, head_mask=None):
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+                start_positions=None, end_positions=None, head_mask=None):
 
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+
+        output = transformer_outputs[0]
         logits = self.qa_outputs(output)
         start_logits, end_logits = logits.split(1, dim=-1)
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
+        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -1648,7 +1663,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        elif self.output_attentions:
-            return all_attentions, start_logits, end_logits
-        return start_logits, end_logits
+            outputs = [total_loss] + outputs
+
+        outputs = [start_logits, end_logits] + outputs
+
+        return outputs
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index c30e263181..71e9f584dd 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -323,16 +323,13 @@ except ImportError:
             return self.weight * x + self.bias
 
 class XLNetRelativeAttention(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False):
         super(XLNetRelativeAttention, self).__init__()
         self.output_attentions = output_attentions
         if config.d_model % config.n_head != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
                 "heads (%d)" % (config.d_model, config.n_head))
-        self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
 
         self.n_head = config.n_head
         self.d_head = config.d_head
@@ -368,7 +365,7 @@ class XLNetRelativeAttention(nn.Module):
 
         return x
 
-    def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None):
+    def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None):
         """Core relative positional attention operations."""
 
         # content based attention score
@@ -395,9 +392,16 @@ class XLNetRelativeAttention(nn.Module):
         attn_prob = F.softmax(attn_score, dim=1)
         attn_prob = self.dropout(attn_prob)
 
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
         # attention output
         attn_vec = torch.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
 
+        if self.output_attentions:
+            return attn_vec, attn_prob
+
         return attn_vec
 
     def post_attention(self, h, attn_vec, residual=True):
@@ -439,7 +443,10 @@ class XLNetRelativeAttention(nn.Module):
 
             # core attention ops
             attn_vec_h = self.rel_attn_core(
-                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h)
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask)
+
+            if self.output_attentions:
+                attn_vec_h, attn_prob_h = attn_vec_h
 
             # post processing
             output_h = self.post_attention(h, attn_vec_h)
@@ -452,14 +459,25 @@ class XLNetRelativeAttention(nn.Module):
             if target_mapping is not None:
                 q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
                 attn_vec_g = self.rel_attn_core(
-                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g)
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask)
+
+                if self.output_attentions:
+                    attn_vec_g, attn_prob_g = attn_vec_g
+
                 attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
             else:
                 attn_vec_g = self.rel_attn_core(
-                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g)
+                    q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask)
+
+                if self.output_attentions:
+                    attn_vec_g, attn_prob_g = attn_vec_g
 
             # post processing
             output_g = self.post_attention(g, attn_vec_g)
+
+            if self.output_attentions:
+                attn_prob = attn_prob_h, attn_prob_g
+
         else:
             ###### Multi-head attention with relative positional encoding
             if mems is not None and mems.dim() > 1:
@@ -477,30 +495,18 @@ class XLNetRelativeAttention(nn.Module):
 
             # core attention ops
             attn_vec = self.rel_attn_core(
-                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h)
+                q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask)
+
+            if self.output_attentions:
+                attn_vec, attn_prob = attn_vec
 
             # post processing
             output_h = self.post_attention(h, attn_vec)
             output_g = None
 
+        if self.output_attentions:
+            return output_h, output_g, attn_prob
 
-        # Mask heads if we want to
-        # if head_mask is not None:
-        #     attention_probs = attention_probs * head_mask
-
-        # context_layer = torch.matmul(attention_probs, value_layer)
-        # if self.keep_multihead_output:
-        #     self.multihead_output = context_layer
-        #     self.multihead_output.retain_grad()
-
-        # context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        # new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        # context_layer = context_layer.view(*new_context_layer_shape)
-
-        # if self.output_attentions:
-        #     attentions, self_output = self_output
-        # if self.output_attentions:
-        #     return attentions, attention_output
         return output_h, output_g
 
 class XLNetFeedForward(nn.Module):
@@ -510,7 +516,8 @@ class XLNetFeedForward(nn.Module):
         self.layer_1 = nn.Linear(config.d_model, config.d_inner)
         self.layer_2 = nn.Linear(config.d_inner, config.d_model)
         self.dropout = nn.Dropout(config.dropout)
-        if isinstance(config.ff_activation, str) or (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
+        if isinstance(config.ff_activation, str) or \
+                (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
             self.activation_function = ACT2FN[config.ff_activation]
         else:
             self.activation_function = config.ff_activation
@@ -526,29 +533,27 @@ class XLNetFeedForward(nn.Module):
         return output
 
 class XLNetLayer(nn.Module):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, ):
         super(XLNetLayer, self).__init__()
         self.output_attentions = output_attentions
-        self.rel_attn = XLNetRelativeAttention(config, output_attentions=output_attentions,
-                                               keep_multihead_output=keep_multihead_output)
+        self.rel_attn = XLNetRelativeAttention(config, output_attentions=output_attentions)
         self.ff = XLNetFeedForward(config)
         self.dropout = nn.Dropout(config.dropout)
 
     def forward(self, output_h, output_g,
                 attn_mask_h, attn_mask_g,
-                r, seg_mat,
-                mems=None, target_mapping=None, head_mask=None):
-        output_h, output_g = self.rel_attn(output_h, output_g,
-                                           attn_mask_h, attn_mask_g,
-                                           r, seg_mat,
-                                           mems=mems, target_mapping=target_mapping, head_mask=head_mask)
+                r, seg_mat, mems=None, target_mapping=None, head_mask=None):
+        outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g,
+                                r, seg_mat, mems=mems, target_mapping=target_mapping,
+                                head_mask=head_mask)
+        output_h, output_g = outputs[:2]
+
         if output_g is not None:
             output_g = self.ff(output_g)
         output_h = self.ff(output_h)
 
-        # if self.output_attentions:
-        #     return attentions, layer_output
-        return output_h, output_g
+        outputs = [output_h, output_g] + outputs[2:]  # Add again attentions if there are there
+        return outputs
 
 
 class XLNetPreTrainedModel(PreTrainedModel):
@@ -584,9 +589,11 @@ class XLNetPreTrainedModel(PreTrainedModel):
 
 
 class XLNetModel(XLNetPreTrainedModel):
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLNetModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.mem_len = config.mem_len
         self.reuse_len = config.reuse_len
         self.d_model = config.d_model
@@ -597,8 +604,7 @@ class XLNetModel(XLNetPreTrainedModel):
 
         self.word_embedding = nn.Embedding(config.n_token, config.d_model)
         self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
-        layer = XLNetLayer(config, output_attentions=output_attentions,
-                                   keep_multihead_output=keep_multihead_output)
+        layer = XLNetLayer(config, output_attentions=output_attentions)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
 
@@ -851,28 +857,39 @@ class XLNetModel(XLNetPreTrainedModel):
         if mems is None:
             mems = [None] * len(self.layer)
 
+        attentions = []
         hidden_states = []
         for i, layer_module in enumerate(self.layer):
             # cache new mems
             new_mems.append(self.cache_mem(output_h, mems[i]))
+            if self.output_hidden_states:
+                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
+
+            outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
+                                   r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping,
+                                   head_mask=head_mask)
+            output_h, output_g = outputs[:2]
+            if self.output_attentions:
+                attentions.append(outputs[2:])
+
+        # Add last hidden state
+        if self.output_hidden_states:
             hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
-            output_h, output_g = layer_module(output_h, output_g,
-                                              attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
-                                              r=pos_emb, seg_mat=seg_mat,
-                                              mems=mems[i], target_mapping=target_mapping,
-                                              head_mask=head_mask)
-        hidden_states.append((output_h, output_g) if output_g is not None else output_h)
         output = self.dropout(output_g if output_g is not None else output_h)
 
-        # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
-        output = output.permute(1, 0, 2).contiguous()
-        if output_g is not None:
-            hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
-        else:
-            hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
+        outputs = [output.permute(1, 0, 2).contiguous(), new_mems]
+        if self.output_hidden_states:
+            if output_g is not None:
+                hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
+            else:
+                hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
+            outputs.append(hidden_states)
+        if self.output_attentions:
+            outputs.append(attentions)
 
-        return output, hidden_states, new_mems
+        return outputs  # outputs, new_mems, (hidden_states), (attentions)
 
 
 class XLNetLMHeadModel(XLNetPreTrainedModel):
@@ -936,14 +953,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLNetLMHeadModel, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.attn_type = config.attn_type
         self.same_length = config.same_length
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                              keep_multihead_output=keep_multihead_output)
+                                              output_hidden_states=output_hidden_states)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
         # Tie weights
@@ -989,27 +1008,24 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q, head_mask)
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
 
-        logits = self.lm_loss(output)
+        logits = self.lm_loss(transformer_outputs[0])
+
+        outputs = [logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 
         if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(logits.view(-1, logits.size(-1)),
                             labels.view(-1))
-            return loss, new_mems
+            outputs = [loss] + outputs
 
-        # if self.output_attentions:
-        #     all_attentions, encoded_layers = encoded_layers
-        # if self.output_attentions:
-        return logits, new_mems
-        #     return all_attentions, encoded_layers, pooled_output
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
 class XLNetSequenceSummary(nn.Module):
-    def __init__(self, config, summary_type="last", use_proj=True,
-                 output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, summary_type="last", use_proj=True):
         super(XLNetSequenceSummary, self).__init__()
         self.summary_type = summary_type
         if use_proj:
@@ -1106,20 +1122,20 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 output_attentions=False, keep_multihead_output=False):
+                 output_attentions=False, output_hidden_states=False):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.attn_type = config.attn_type
         self.same_length = config.same_length
         self.summary_type = summary_type
         self.num_labels = num_labels
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                              keep_multihead_output=keep_multihead_output)
+                                              output_hidden_states=output_hidden_states)
 
-        self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
-                                                     use_proj=use_proj, output_attentions=output_attentions,
-                                                     keep_multihead_output=keep_multihead_output)
+        self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type, use_proj=use_proj)
         self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
@@ -1153,12 +1169,15 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
+        output = transformer_outputs[0]
 
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
 
+        outputs = [logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+
         if labels is not None:
             if self.num_labels == 1:
                 #  We are doing regression
@@ -1167,13 +1186,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss, new_mems
+            outputs = [loss] + outputs
+
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
-        # if self.output_attentions:
-        #     all_attentions, encoded_layers = encoded_layers
-        # if self.output_attentions:
-        return logits, new_mems
-        #     return all_attentions, encoded_layers, pooled_output
 
 class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     """XLNet model for Question Answering (span extraction).
@@ -1231,25 +1247,30 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLNetForQuestionAnswering, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
+
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+                                      output_hidden_states=output_hidden_states)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 start_positions=None, end_positions=None, head_mask=None):
-        output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q, head_mask)
 
-        logits = self.qa_outputs(output)
+        logits = self.qa_outputs(transformer_outputs[0])
+
         start_logits, end_logits = logits.split(1, dim=-1)
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
+        outputs = [start_logits, end_logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -1265,7 +1286,6 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        elif self.output_attentions:
-            return all_attentions, start_logits, end_logits
-        return start_logits, end_logits
+            outputs = [total_loss] + outputs
+
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)

From 213981d8cb0647c432e78aced2cae74eedc171c8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 28 Jun 2019 16:45:24 +0200
Subject: [PATCH 031/139] updating bert API

---
 pytorch_pretrained_bert/modeling.py | 150 +++++++++++++---------------
 1 file changed, 72 insertions(+), 78 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index eade7310f9..67054d416d 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -814,31 +814,28 @@ class BertForMaskedLM(BertPreTrainedModel):
     masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertForMaskedLM, self).__init__(config)
         self.output_attentions = output_attentions
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+        self.output_hidden_states = output_hidden_states
+
+        self.bert = BertModel(config, output_attentions=output_attentions )
         self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask,
-                                       output_all_encoded_layers=False,
-                                       head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, sequence_output, _ = outputs
-        else:
-            sequence_output, _ = outputs
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+
+        sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
 
+        outputs = [prediction_scores] + outputs[2:]  # Add hidden states and attention is they are here
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            return masked_lm_loss
-        elif self.output_attentions:
-            return all_attentions, prediction_scores
-        return prediction_scores
+            outputs = [masked_lm_loss] + outputs
+
+        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
 class BertForNextSentencePrediction(BertPreTrainedModel):
@@ -889,31 +886,29 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertForNextSentencePrediction, self).__init__(config)
         self.output_attentions = output_attentions
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+        self.output_hidden_states = output_hidden_states
+
+        self.bert = BertModel(config, output_attentions=output_attentions)
         self.cls = BertOnlyNSPHead(config)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask,
-                                     output_all_encoded_layers=False,
-                                     head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, _, pooled_output = outputs
-        else:
-            _, pooled_output = outputs
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        pooled_output = outputs[1]
+
         seq_relationship_score = self.cls(pooled_output)
 
+        outputs = [seq_relationship_score] + outputs[2:]  # add hidden states and attention if they are here
         if next_sentence_label is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
-            return next_sentence_loss
-        elif self.output_attentions:
-            return all_attentions, seq_relationship_score
-        return seq_relationship_score
+            outputs = [next_sentence_loss] + outputs
+
+        return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
 
 
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -966,25 +961,27 @@ class BertForSequenceClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, num_labels=2, output_attentions=False, output_hidden_states=False):
         super(BertForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
         self.num_labels = num_labels
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+
+        self.bert = BertModel(config, output_attentions=output_attentions)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, _, pooled_output = outputs
-        else:
-            _, pooled_output = outputs
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        pooled_output = outputs[1]
+
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
 
+        outputs = [logits] + outputs[2:]  # add hidden states and attention if they are here
+
         if labels is not None:
             if self.num_labels == 1:
                 #  We are doing regression
@@ -993,10 +990,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
-        elif self.output_attentions:
-            return all_attentions, logits
-        return logits
+            outputs = [loss] + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
 class BertForMultipleChoice(BertPreTrainedModel):
@@ -1048,36 +1044,37 @@ class BertForMultipleChoice(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, num_choices=2, output_attentions=False, output_hidden_states=False):
         super(BertForMultipleChoice, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
         self.num_choices = num_choices
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+
+        self.bert = BertModel(config, output_attentions=output_attentions)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, _, pooled_output = outputs
-        else:
-            _, pooled_output = outputs
+        outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, head_mask=head_mask)
+        pooled_output = outputs[1]
+
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
+        outputs = [reshaped_logits] + outputs[2:]  # add hidden states and attention if they are here
+
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
-            return loss
-        elif self.output_attentions:
-            return all_attentions, reshaped_logits
-        return reshaped_logits
+            outputs = [loss] + outputs
+
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1130,25 +1127,26 @@ class BertForTokenClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, num_labels=2, output_attentions=False, output_hidden_states=False):
         super(BertForTokenClassification, self).__init__(config)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
         self.num_labels = num_labels
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+
+        self.bert = BertModel(config, output_attentions=output_attentions)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, sequence_output, _ = outputs
-        else:
-            sequence_output, _ = outputs
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        sequence_output = outputs[0]
+
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
+        outputs = [logits] + outputs[2:]  # add hidden states and attention if they are here
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             # Only keep active parts of the loss
@@ -1159,10 +1157,9 @@ class BertForTokenClassification(BertPreTrainedModel):
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
-        elif self.output_attentions:
-            return all_attentions, logits
-        return logits
+            outputs = [loss] + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
 class BertForQuestionAnswering(BertPreTrainedModel):
@@ -1217,28 +1214,26 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(BertForQuestionAnswering, self).__init__(config)
         self.output_attentions = output_attentions
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      keep_multihead_output=keep_multihead_output)
+        self.output_hidden_states = output_hidden_states
+        self.bert = BertModel(config, output_attentions=output_attentions)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask,
-                                                       output_all_encoded_layers=False,
-                                                       head_mask=head_mask)
-        if self.output_attentions:
-            all_attentions, sequence_output, _ = outputs
-        else:
-            sequence_output, _ = outputs
+        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        sequence_output = outputs[0]
+
         logits = self.qa_outputs(sequence_output)
         start_logits, end_logits = logits.split(1, dim=-1)
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
+        outputs = [start_logits, end_logits] + outputs[2:]
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -1254,7 +1249,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        elif self.output_attentions:
-            return all_attentions, start_logits, end_logits
-        return start_logits, end_logits
+            outputs = [total_loss] + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)

From d9184620f98fead41b9c83c78cbe2e167106c498 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 29 Jun 2019 23:10:40 +0200
Subject: [PATCH 032/139] fix tests and new API

---
 pytorch_pretrained_bert/modeling.py       |  30 ++--
 pytorch_pretrained_bert/modeling_xlnet.py |   8 +-
 tests/modeling_test.py                    | 158 ++++++++++------------
 tests/modeling_xlnet_test.py              |  33 +----
 4 files changed, 95 insertions(+), 134 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 67054d416d..a957ac8d78 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -320,9 +320,6 @@ class BertSelfAttention(nn.Module):
             attention_probs = attention_probs * head_mask
 
         context_layer = torch.matmul(attention_probs, value_layer)
-        if self.keep_multihead_output:
-            self.multihead_output = context_layer
-            self.multihead_output.retain_grad()
 
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
@@ -416,7 +413,8 @@ class BertLayer(nn.Module):
 
     def forward(self, hidden_states, attention_mask, head_mask=None):
         attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
-        intermediate_output = self.intermediate(attention_outputs[0])
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
         outputs = [layer_output] + attention_outputs[1:]  # add attentions if we output them
         return outputs
@@ -571,8 +569,7 @@ class BertModel(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@@ -688,8 +685,7 @@ class BertForPreTraining(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@@ -770,8 +766,7 @@ class BertForMaskedLM(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@@ -845,8 +840,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@@ -919,8 +913,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.
 
     Inputs:
@@ -1003,8 +996,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_choices`: the number of classes for the classifier. Default = 2.
 
     Inputs:
@@ -1085,8 +1077,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.
 
     Inputs:
@@ -1170,8 +1161,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     Params:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 71e9f584dd..c8fff081cb 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -504,10 +504,10 @@ class XLNetRelativeAttention(nn.Module):
             output_h = self.post_attention(h, attn_vec)
             output_g = None
 
+        outputs = [output_h, output_g]
         if self.output_attentions:
-            return output_h, output_g, attn_prob
-
-        return output_h, output_g
+            outputs = outputs + [attn_prob]
+        return outputs
 
 class XLNetFeedForward(nn.Module):
     def __init__(self, config):
@@ -867,7 +867,7 @@ class XLNetModel(XLNetPreTrainedModel):
 
             outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
                                    r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping,
-                                   head_mask=head_mask)
+                                   head_mask=head_mask[i])
             output_h, output_g = outputs[:2]
             if self.output_attentions:
                 attentions.append(outputs[2:])
diff --git a/tests/modeling_test.py b/tests/modeling_test.py
index 126c6fad13..10e93658c9 100644
--- a/tests/modeling_test.py
+++ b/tests/modeling_test.py
@@ -123,9 +123,13 @@ class BertModelTest(unittest.TestCase):
         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
             model.eval()
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
+
+            model = BertModel(config=config, output_hidden_states=True)
+            model.eval()
+            _, _, all_encoder_layers = model(input_ids, token_type_ids, input_mask)
             outputs = {
-                "sequence_output": all_encoder_layers[-1],
+                "sequence_output": sequence_output,
                 "pooled_output": pooled_output,
                 "all_encoder_layers": all_encoder_layers,
             }
@@ -134,7 +138,7 @@ class BertModelTest(unittest.TestCase):
         def check_bert_model_output(self, result):
             self.parent.assertListEqual(
                 [size for layer in result["all_encoder_layers"] for size in layer.size()],
-                [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers)
+                [self.batch_size, self.seq_length, self.hidden_size] * (self.num_hidden_layers + 1))
             self.parent.assertListEqual(
                 list(result["sequence_output"].size()),
                 [self.batch_size, self.seq_length, self.hidden_size])
@@ -144,8 +148,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, token_labels)
-            prediction_scores = model(input_ids, token_type_ids, input_mask)
+            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
             outputs = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -160,8 +163,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
-            seq_relationship_score = model(input_ids, token_type_ids, input_mask)
+            loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
             outputs = {
                 "loss": loss,
                 "seq_relationship_score": seq_relationship_score,
@@ -177,8 +179,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
-            prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
+            loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             outputs = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -198,8 +199,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
-            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+            loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             outputs = {
                 "loss": loss,
                 "start_logits": start_logits,
@@ -219,8 +219,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
-            logits = model(input_ids, token_type_ids, input_mask)
+            loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
             outputs = {
                 "loss": loss,
                 "logits": logits,
@@ -236,8 +235,7 @@ class BertModelTest(unittest.TestCase):
         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
             model.eval()
-            loss = model(input_ids, token_type_ids, input_mask, token_labels)
-            logits = model(input_ids, token_type_ids, input_mask)
+            loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
             outputs = {
                 "loss": loss,
                 "logits": logits,
@@ -256,13 +254,10 @@ class BertModelTest(unittest.TestCase):
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss = model(multiple_choice_inputs_ids,
+            loss, logits = model(multiple_choice_inputs_ids,
                          multiple_choice_token_type_ids,
                          multiple_choice_input_mask,
                          choice_labels)
-            logits = model(multiple_choice_inputs_ids,
-                           multiple_choice_token_type_ids,
-                           multiple_choice_input_mask)
             outputs = {
                 "loss": loss,
                 "logits": logits,
@@ -285,8 +280,8 @@ class BertModelTest(unittest.TestCase):
                 else:
                     model = model_class(config=config, output_attentions=True)
                 model.eval()
-                output = model(input_ids, token_type_ids, input_mask)
-                attentions = output[0]
+                outputs = model(input_ids, token_type_ids, input_mask)
+                attentions = outputs[-1]
                 self.parent.assertEqual(len(attentions), self.num_hidden_layers)
                 self.parent.assertListEqual(
                     list(attentions[0].size()),
@@ -300,57 +295,56 @@ class BertModelTest(unittest.TestCase):
                 if model_class in [BertForSequenceClassification,
                                    BertForTokenClassification]:
                     model = model_class(config=config,
-                                        num_labels=self.num_labels,
-                                        keep_multihead_output=True)
+                                        num_labels=self.num_labels)
                 else:
-                    model = model_class(config=config, keep_multihead_output=True)
+                    model = model_class(config=config)
                 model.eval()
                 head_mask = torch.ones(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device)
                 head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
                 head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)
+                # Set that after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
+                head_mask.requires_grad_(requires_grad=True)
+                outputs = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)
 
-                if isinstance(model, BertModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
+                # Compute some gradients
+                output = sum(t.sum() for t in outputs[0])
                 output = output.sum()
                 output.backward()
-                multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs()
+                multihead_outputs = head_mask.grad
 
                 self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size, self.num_attention_heads,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
-                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[0].size()),
+                #     [self.batch_size, self.num_attention_heads,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
+                #     0)
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[0][:, 0, :, :].nonzero()),
+                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
+                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
 
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size, self.num_attention_heads,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
-                self.parent.assertEqual(
-                    len(multihead_outputs[1].nonzero()),
-                    multihead_outputs[1].numel())
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[1].size()),
+                #     [self.batch_size, self.num_attention_heads,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[1].nonzero()),
+                #     multihead_outputs[1].numel())
 
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size, self.num_attention_heads,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[-1].size()),
+                #     [self.batch_size, self.num_attention_heads,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
+                #     0)
+                # self.parent.assertEqual(
+                #     len(multihead_outputs[-1][:, 0, :, :].nonzero()),
+                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
 
 
         def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
@@ -360,38 +354,34 @@ class BertModelTest(unittest.TestCase):
                 if model_class in [BertForSequenceClassification,
                                    BertForTokenClassification]:
                     model = model_class(config=config,
-                                        num_labels=self.num_labels,
-                                        keep_multihead_output=True)
+                                        num_labels=self.num_labels)
                 else:
-                    model = model_class(config=config, keep_multihead_output=True)
+                    model = model_class(config=config)
                 model.eval()
                 bert_model = model if isinstance(model, BertModel) else model.bert
                 heads_to_prune = {0: list(range(1, self.num_attention_heads)),
                                   -1: [0]}
                 bert_model.prune_heads(heads_to_prune)
-                output = model(input_ids, token_type_ids, input_mask)
+                outputs = model(input_ids, token_type_ids, input_mask)
 
-                if isinstance(model, BertModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
-                output = output.sum()
-                output.backward()
-                multihead_outputs = bert_model.get_multihead_outputs()
+                # output = sum(t.sum() for t in outputs[0])
+                # output = output.sum()
+                # output.backward()
+                # multihead_outputs = bert_model.get_multihead_outputs()
 
-                self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size, 1,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size, self.num_attention_heads,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size, self.num_attention_heads-1,
-                     self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[0].size()),
+                #     [self.batch_size, 1,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[1].size()),
+                #     [self.batch_size, self.num_attention_heads,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
+                # self.parent.assertListEqual(
+                #     list(multihead_outputs[-1].size()),
+                #     [self.batch_size, self.num_attention_heads-1,
+                #      self.seq_length, self.hidden_size // self.num_attention_heads])
 
 
     def test_default(self):
diff --git a/tests/modeling_xlnet_test.py b/tests/modeling_xlnet_test.py
index 237fa16d3a..e696c618b1 100644
--- a/tests/modeling_xlnet_test.py
+++ b/tests/modeling_xlnet_test.py
@@ -134,26 +134,19 @@ class XLNetModelTest(unittest.TestCase):
             model = XLNetLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
-            all_logits_1, mems_1b = model(input_ids_1, token_type_ids=segment_ids)
+            loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
 
-            loss_2, mems_2a = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1a)
-            all_logits_2, mems_2b = model(input_ids_2, token_type_ids=segment_ids, mems=mems_1b)
+            loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1)
 
-            logits, _ = model(input_ids_q,
-                                    perm_mask=perm_mask,
-                                    target_mapping=target_mapping,
-                                    inp_q=inp_q)
+            logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping, inp_q=inp_q)
 
             outputs = {
                 "loss_1": loss_1,
-                "mems_1a": mems_1a,
+                "mems_1": mems_1,
                 "all_logits_1": all_logits_1,
-                "mems_1b": mems_1b,
                 "loss_2": loss_2,
-                "mems_2a": mems_2a,
+                "mems_2": mems_2,
                 "all_logits_2": all_logits_2,
-                "mems_2b": mems_2b,
             }
             return outputs
 
@@ -165,14 +158,8 @@ class XLNetModelTest(unittest.TestCase):
                 list(result["all_logits_1"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1a"]),
+                list(list(mem.size()) for mem in result["mems_1"]),
                 [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1b"]),
-                [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
@@ -181,14 +168,8 @@ class XLNetModelTest(unittest.TestCase):
                 list(result["all_logits_2"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2a"]),
+                list(list(mem.size()) for mem in result["mems_2"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2b"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))
 
     def test_default(self):
         self.run_tester(XLNetModelTest.XLNetModelTester(self))

From 4f8b5f687cc9a2837bdb19ca0a204f325f6b28e8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 29 Jun 2019 23:35:21 +0200
Subject: [PATCH 033/139] add fix for serialization of tokenizer

---
 pytorch_pretrained_bert/tokenization_xlnet.py | 15 ++++++++++++
 tests/tokenization_xlnet_test.py              | 24 +++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index 63717a6e25..3e28bbb807 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -182,6 +182,21 @@ class XLNetTokenizer(object):
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
     def set_special_tokens(self, special_tokens):
         """ Add a list of additional tokens to the encoder.
             The additional tokens are indexed starting from the last index of the
diff --git a/tests/tokenization_xlnet_test.py b/tests/tokenization_xlnet_test.py
index 30d7f37c04..285dee226d 100644
--- a/tests/tokenization_xlnet_test.py
+++ b/tests/tokenization_xlnet_test.py
@@ -15,11 +15,17 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
+import sys
 import unittest
 from io import open
 import shutil
 import pytest
 
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
 from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
@@ -43,8 +49,6 @@ class XLNetTokenizationTest(unittest.TestCase):
         vocab_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path)
         tokenizer = tokenizer.from_pretrained(vocab_path,
                                               keep_accents=True)
-        os.remove(vocab_file)
-        os.remove(special_tokens_file)
 
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
         self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
@@ -65,6 +69,22 @@ class XLNetTokenizationTest(unittest.TestCase):
                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                            u'<unk>', u'.'])
 
+        text = "Munich and Berlin are nice cities"
+        filename = u"/tmp/tokenizer.bin"
+
+        subwords = tokenizer.tokenize(text)
+
+        pickle.dump(tokenizer, open(filename, "wb"))
+
+        tokenizer_new = pickle.load(open(filename, "rb"))
+        subwords_loaded = tokenizer_new.tokenize(text)
+
+        self.assertListEqual(subwords, subwords_loaded)
+
+        os.remove(filename)
+        os.remove(vocab_file)
+        os.remove(special_tokens_file)
+
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"

From 1484d67de9f4c8301874d62d821bbfbd3d6ce99b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 2 Jul 2019 12:13:17 +0200
Subject: [PATCH 034/139] [LARGE] updating all tests and API

---
 pytorch_pretrained_bert/model_utils.py        |  76 +-
 pytorch_pretrained_bert/modeling.py           | 112 ++-
 pytorch_pretrained_bert/modeling_gpt2.py      | 306 ++-----
 pytorch_pretrained_bert/modeling_openai.py    | 160 ++--
 .../modeling_transfo_xl.py                    | 202 +++--
 pytorch_pretrained_bert/modeling_xlm.py       | 783 +++---------------
 pytorch_pretrained_bert/modeling_xlnet.py     | 140 ++--
 pytorch_pretrained_bert/tests/__init__.py     |   0
 .../tests}/conftest.py                        |   0
 .../tests/fixtures}/input.txt                 |   0
 .../tests/fixtures}/sample_text.txt           |   0
 .../tests/fixtures}/test_sentencepiece.model  | Bin
 .../tests/model_tests_commons.py              | 379 +++++++++
 .../tests/model_utils_test.py                 |  50 ++
 .../tests/modeling_gpt2_test.py               |  55 ++
 .../tests/modeling_openai_test.py             |  55 ++
 .../tests/modeling_test.py                    | 307 +++++++
 .../tests}/modeling_transfo_xl_test.py        | 115 +--
 .../tests}/modeling_xlnet_test.py             |  88 +-
 .../tests}/optimization_test.py               |   0
 .../tests}/tokenization_gpt2_test.py          |   0
 .../tests}/tokenization_openai_test.py        |   0
 .../tests}/tokenization_test.py               |   0
 .../tests}/tokenization_transfo_xl_test.py    |   0
 .../tests}/tokenization_xlnet_test.py         |   5 +-
 tests/modeling_gpt2_test.py                   | 364 --------
 tests/modeling_openai_test.py                 | 338 --------
 tests/modeling_test.py                        | 467 -----------
 xlnet                                         |   1 -
 29 files changed, 1482 insertions(+), 2521 deletions(-)
 create mode 100644 pytorch_pretrained_bert/tests/__init__.py
 rename {tests => pytorch_pretrained_bert/tests}/conftest.py (100%)
 rename {samples => pytorch_pretrained_bert/tests/fixtures}/input.txt (100%)
 rename {samples => pytorch_pretrained_bert/tests/fixtures}/sample_text.txt (100%)
 rename {samples => pytorch_pretrained_bert/tests/fixtures}/test_sentencepiece.model (100%)
 create mode 100644 pytorch_pretrained_bert/tests/model_tests_commons.py
 create mode 100644 pytorch_pretrained_bert/tests/model_utils_test.py
 create mode 100644 pytorch_pretrained_bert/tests/modeling_gpt2_test.py
 create mode 100644 pytorch_pretrained_bert/tests/modeling_openai_test.py
 create mode 100644 pytorch_pretrained_bert/tests/modeling_test.py
 rename {tests => pytorch_pretrained_bert/tests}/modeling_transfo_xl_test.py (61%)
 rename {tests => pytorch_pretrained_bert/tests}/modeling_xlnet_test.py (77%)
 rename {tests => pytorch_pretrained_bert/tests}/optimization_test.py (100%)
 rename {tests => pytorch_pretrained_bert/tests}/tokenization_gpt2_test.py (100%)
 rename {tests => pytorch_pretrained_bert/tests}/tokenization_openai_test.py (100%)
 rename {tests => pytorch_pretrained_bert/tests}/tokenization_test.py (100%)
 rename {tests => pytorch_pretrained_bert/tests}/tokenization_transfo_xl_test.py (100%)
 rename {tests => pytorch_pretrained_bert/tests}/tokenization_xlnet_test.py (97%)
 delete mode 100644 tests/modeling_gpt2_test.py
 delete mode 100644 tests/modeling_openai_test.py
 delete mode 100644 tests/modeling_test.py
 delete mode 160000 xlnet

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index 1bc1632580..15f6a4d5b4 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -41,6 +41,12 @@ class PretrainedConfig(object):
     """
     pretrained_config_archive_map = {}
 
+    def __init__(self, **kwargs):
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.output_attentions = kwargs.pop('output_attentions', False)
+        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         """
@@ -114,6 +120,9 @@ class PretrainedConfig(object):
             text = reader.read()
         return cls.from_dict(json.loads(text))
 
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
     def __repr__(self):
         return str(self.to_json_string())
 
@@ -133,12 +142,11 @@ class PretrainedConfig(object):
 
 
 class PreTrainedModel(nn.Module):
-    """ An abstract class to handle weights initialization and
+    """ An abstract class to handle storing model config and
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = PretrainedConfig
     pretrained_model_archive_map = {}
-    pretrained_config_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
 
@@ -151,8 +159,16 @@ class PreTrainedModel(nn.Module):
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
                 ))
+        # Save config in model
         self.config = config
 
+    def prune_heads(self, heads_to_prune):
+        """ Prunes heads of the base model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        model_to_prune._prune_heads(heads_to_prune)
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         """
@@ -175,24 +191,22 @@ class PreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific XLNet class
                 (ex: num_labels for XLNetForSequenceClassification)
         """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
+        state_dict = kwargs.pop('state_dict', None)
+        cache_dir = kwargs.pop('cache_dir', None)
+        from_tf = kwargs.pop('from_tf', None)
 
+        # Load config
+        config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+        # Load model
         if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
             archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         else:
             if from_tf:
                 # Directly load from a TensorFlow checkpoint
                 archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
-                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
             else:
                 archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-                config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
@@ -210,47 +224,15 @@ class PreTrainedModel(nn.Module):
                         ', '.join(cls.pretrained_model_archive_map.keys()),
                         archive_file))
             return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(cls.pretrained_config_archive_map.keys()),
-                        config_file))
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+        if resolved_archive_file == archive_file:
             logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
         else:
             logger.info("loading weights file {} from cache at {}".format(
                 archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-
-        # Load config
-        config = cls.config_class.from_json_file(resolved_config_file)
-
-        # Update config with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
-            if hasattr(config, key):
-                setattr(config, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
-
-        logger.info("Model config {}".format(config))
 
         # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
+        model = cls(config)
+
         if state_dict is None and not from_tf:
             state_dict = torch.load(resolved_archive_file, map_location='cpu')
         if from_tf:
@@ -275,7 +257,7 @@ class PreTrainedModel(nn.Module):
                 if child is not None:
                     load(child, prefix + name + '.')
 
-        # Be able to load base models as well as derived models (with heads)
+        # Make sure we are able to load base models as well as derived models (with heads)
         start_prefix = ''
         model_to_load = model
         if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index a957ac8d78..f2b63634b1 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -155,7 +155,7 @@ class BertConfig(PretrainedConfig):
     pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file,
+                 vocab_size_or_config_json_file=30522,
                  hidden_size=768,
                  num_hidden_layers=12,
                  num_attention_heads=12,
@@ -167,7 +167,7 @@ class BertConfig(PretrainedConfig):
                  type_vocab_size=2,
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
-                 finetuning_task=None):
+                 **kwargs):
         """Constructs BertConfig.
 
         Args:
@@ -192,8 +192,8 @@ class BertConfig(PretrainedConfig):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
-            finetuning_task: name of the glue task on which the model was fine-tuned if any
         """
+        super(BertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
@@ -213,7 +213,6 @@ class BertConfig(PretrainedConfig):
             self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
-            self.finetuning_task = finetuning_task
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -270,13 +269,13 @@ class BertEmbeddings(nn.Module):
 
 
 class BertSelfAttention(nn.Module):
-    def __init__(self, config, output_attentions=False):
+    def __init__(self, config):
         super(BertSelfAttention, self).__init__()
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
-        self.output_attentions = output_attentions
+        self.output_attentions = config.output_attentions
 
         self.num_attention_heads = config.num_attention_heads
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
@@ -344,10 +343,9 @@ class BertSelfOutput(nn.Module):
 
 
 class BertAttention(nn.Module):
-    def __init__(self, config, output_attentions=False):
+    def __init__(self, config):
         super(BertAttention, self).__init__()
-        self.output_attentions = output_attentions
-        self.self = BertSelfAttention(config, output_attentions=output_attentions)
+        self.self = BertSelfAttention(config)
         self.output = BertSelfOutput(config)
 
     def prune_heads(self, heads):
@@ -404,10 +402,9 @@ class BertOutput(nn.Module):
 
 
 class BertLayer(nn.Module):
-    def __init__(self, config, output_attentions=False):
+    def __init__(self, config):
         super(BertLayer, self).__init__()
-        self.output_attentions = output_attentions
-        self.attention = BertAttention(config, output_attentions=output_attentions)
+        self.attention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
@@ -421,11 +418,11 @@ class BertLayer(nn.Module):
 
 
 class BertEncoder(nn.Module):
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertEncoder, self).__init__()
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        layer = BertLayer(config, output_attentions=output_attentions)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        layer = BertLayer(config)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask, head_mask=None):
@@ -546,9 +543,6 @@ class BertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_bert
     base_model_prefix = "bert"
 
-    def __init__(self, *inputs, **kwargs):
-        super(BertPreTrainedModel, self).__init__(*inputs, **kwargs)
-
     def init_weights(self, module):
         """ Initialize the weights.
         """
@@ -612,19 +606,19 @@ class BertModel(BertPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
+
         self.embeddings = BertEmbeddings(config)
-        self.encoder = BertEncoder(config, output_attentions=output_attentions,
-                                           output_hidden_states=output_hidden_states)
+        self.encoder = BertEncoder(config)
         self.pooler = BertPooler(config)
+
         self.apply(self.init_weights)
 
-    def prune_heads(self, heads_to_prune):
+    def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
@@ -730,14 +724,12 @@ class BertForPreTraining(BertPreTrainedModel):
     masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForPreTraining, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.bert = BertModel(config, output_attentions=output_attentions,
-                                      output_hidden_states=output_hidden_states)
+        self.bert = BertModel(config)
         self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
@@ -809,13 +801,12 @@ class BertForMaskedLM(BertPreTrainedModel):
     masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForMaskedLM, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.bert = BertModel(config, output_attentions=output_attentions )
+        self.bert = BertModel(config)
         self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
@@ -880,12 +871,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForNextSentencePrediction, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.bert = BertModel(config, output_attentions=output_attentions)
+        self.bert = BertModel(config)
         self.cls = BertOnlyNSPHead(config)
 
         self.apply(self.init_weights)
@@ -954,15 +943,13 @@ class BertForSequenceClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForSequenceClassification, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.num_labels = num_labels
+        self.num_labels = config.num_labels
 
-        self.bert = BertModel(config, output_attentions=output_attentions)
+        self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, num_labels)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
 
         self.apply(self.init_weights)
 
@@ -997,7 +984,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
-        `num_choices`: the number of classes for the classifier. Default = 2.
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
@@ -1030,25 +1016,23 @@ class BertForMultipleChoice(BertPreTrainedModel):
     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    num_choices = 2
-
-    model = BertForMultipleChoice(config, num_choices)
+    model = BertForMultipleChoice(config)
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_choices=2, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForMultipleChoice, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.num_choices = num_choices
 
-        self.bert = BertModel(config, output_attentions=output_attentions)
+        self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
 
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """ Input shapes should be [bsz, num choices, seq length] """
+        num_choices = input_ids.shape[1]
+
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
@@ -1057,7 +1041,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
-        reshaped_logits = logits.view(-1, self.num_choices)
+        reshaped_logits = logits.view(-1, num_choices)
 
         outputs = [reshaped_logits] + outputs[2:]  # add hidden states and attention if they are here
 
@@ -1118,15 +1102,13 @@ class BertForTokenClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForTokenClassification, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.num_labels = num_labels
+        self.num_labels = config.num_labels
 
-        self.bert = BertModel(config, output_attentions=output_attentions)
+        self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, num_labels)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         self.apply(self.init_weights)
 
@@ -1204,12 +1186,12 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(BertForQuestionAnswering, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-        self.bert = BertModel(config, output_attentions=output_attentions)
-        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
 
         self.apply(self.init_weights)
 
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 792d5e6777..688512ae80 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -119,7 +119,8 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True
+        predict_special_tokens=True,
+        **kwargs
     ):
         """Constructs GPT2Config.
 
@@ -142,6 +143,8 @@ class GPT2Config(PretrainedConfig):
                 initializing all weight matrices.
             predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
+        super(GPT2Config, self).__init__(**kwargs)
+
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
@@ -174,8 +177,10 @@ class GPT2Config(PretrainedConfig):
 
 
 class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, nx, n_ctx, config, scale=False):
         super(Attention, self).__init__()
+        self.output_attentions = config.output_attentions
+
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0
@@ -184,10 +189,6 @@ class Attention(nn.Module):
         self.split_size = n_state
         self.scale = scale
 
-        self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
-
         self.c_attn = Conv1D(n_state * 3, nx)
         self.c_proj = Conv1D(n_state, nx)
         self.attn_dropout = nn.Dropout(config.attn_pdrop)
@@ -224,9 +225,10 @@ class Attention(nn.Module):
         if head_mask is not None:
             w = w * head_mask
 
+        outputs = [torch.matmul(w, v)]
         if self.output_attentions:
-            return w, torch.matmul(w, v)
-        return torch.matmul(w, v)
+            outputs.append(w)
+        return outputs
 
     def merge_heads(self, x):
         x = x.permute(0, 2, 1, 3).contiguous()
@@ -253,19 +255,15 @@ class Attention(nn.Module):
             value = torch.cat((past_value, value), dim=-2)
         present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
 
-        a = self._attn(query, key, value, head_mask)
-        if self.keep_multihead_output:
-            self.multihead_output = a
-            self.multihead_output.retain_grad()
+        attn_outputs = self._attn(query, key, value, head_mask)
+        a = attn_outputs[0]
 
-        if self.output_attentions:
-            attentions, a = a
         a = self.merge_heads(a)
         a = self.c_proj(a)
         a = self.resid_dropout(a)
-        if self.output_attentions:
-            return attentions, a, present
-        return a, present
+
+        outputs = [a, present] + attn_outputs[1:]
+        return outputs  # a, present, (attentions)
 
 
 class MLP(nn.Module):
@@ -284,27 +282,24 @@ class MLP(nn.Module):
 
 
 class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, n_ctx, config, scale=False):
         super(Block, self).__init__()
         nx = config.n_embd
-        self.output_attentions = output_attentions
         self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
+        self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
 
     def forward(self, x, layer_past=None, head_mask=None):
         output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
-        if self.output_attentions:
-            attentions, a, present = output_attn
-        else:
-            a, present = output_attn
+        a = output_attn[0]  # output_attn: a, present, (attentions)
+
         x = x + a
         m = self.mlp(self.ln_2(x))
         x = x + m
-        if self.output_attentions:
-            return attentions, x, present
-        return x, present
+
+        outputs = [x] + output_attn[1:]
+        return outputs  # x, present, (attentions)
 
 
 class GPT2LMHead(nn.Module):
@@ -342,12 +337,17 @@ class GPT2MultipleChoiceHead(nn.Module):
         nn.init.normal_(self.linear.weight, std=0.02)
         nn.init.normal_(self.linear.bias, 0)
 
-    def forward(self, hidden_states, mc_token_ids):
-        # Classification logits
-        # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
-        # (bsz, num_choices, 1, hidden_size)
+    def forward(self, hidden_states, mc_token_ids=None):
+        """ Extract classification token hidden state and project it using self.linear
+            hidden_state: shape (bsz, num_choices, seq_length, hidden_size)
+            mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
+            if mc_token_ids=None we take the last token of the sequence as classification token
+        """
+        if mc_token_ids is None:
+            mc_token_ids = torch.full_like(hidden_states[:, :, :1, :], hidden_states.shape[2] - 1, dtype=torch.long)
+        else:
+            mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+        # mc_token_ids has shape (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
         # (bsz, num_choices, hidden_size)
         multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
@@ -362,13 +362,9 @@ class GPT2PreTrainedModel(PreTrainedModel):
     """
     config_class = GPT2Config
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_gpt2
     base_model_prefix = "transformer"
 
-    def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
-
     def init_weights(self, module):
         """ Initialize the weights.
         """
@@ -403,126 +399,9 @@ class GPT2PreTrainedModel(PreTrainedModel):
             state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific GPT2 class
         """
-        # state_dict = kwargs.get('state_dict', None)
-        # kwargs.pop('state_dict', None)
-        # cache_dir = kwargs.get('cache_dir', None)
-        # kwargs.pop('cache_dir', None)
-        # from_tf = kwargs.get('from_tf', False)
-        # kwargs.pop('from_tf', None)
-        num_special_tokens = kwargs.get('num_special_tokens', None)
-        kwargs.pop('num_special_tokens', None)
+        num_special_tokens = kwargs.pop('num_special_tokens', None)
 
-        # if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-        #     archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-        #     config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        # else:
-        #     archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-        #     config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # # redirect to the cache, if necessary
-        # try:
-        #     resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        # except EnvironmentError:
-        #     if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-        #         logger.error(
-        #             "Couldn't reach server at '{}' to download pretrained weights.".format(
-        #                 archive_file))
-        #     else:
-        #         logger.error(
-        #             "Model name '{}' was not found in model name list ({}). "
-        #             "We assumed '{}' was a path or url but couldn't find file {} "
-        #             "at this path or url.".format(
-        #                 pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-        #                 archive_file
-        #             )
-        #         )
-        #     return None
-        # try:
-        #     resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        # except EnvironmentError:
-        #     if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-        #         logger.error(
-        #             "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-        #                 config_file))
-        #     else:
-        #         logger.error(
-        #             "Model name '{}' was not found in model name list ({}). "
-        #             "We assumed '{}' was a path or url but couldn't find file {} "
-        #             "at this path or url.".format(
-        #                 pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-        #                 config_file
-        #             )
-        #         )
-        #     return None
-        # if resolved_archive_file == archive_file and resolved_config_file == config_file:
-        #     logger.info("loading weights file {}".format(archive_file))
-        #     logger.info("loading configuration file {}".format(config_file))
-        # else:
-        #     logger.info("loading weights file {} from cache at {}".format(
-        #         archive_file, resolved_archive_file))
-        #     logger.info("loading configuration file {} from cache at {}".format(
-        #         config_file, resolved_config_file))
-        # # Load config
-        # config = GPT2Config.from_json_file(resolved_config_file)
-        # logger.info("Model config {}".format(config))
-        # # Instantiate model.
-        # model = cls(config, *inputs, **kwargs)
-        # if state_dict is None and not from_tf:
-        #     state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        # if from_tf:
-        #     # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-        #     return load_tf_weights_in_gpt2(model, resolved_archive_file)
-
-        # old_keys = []
-        # new_keys = []
-        # for key in state_dict.keys():
-        #     new_key = None
-        #     if key.endswith(".g"):
-        #         new_key = key[:-2] + ".weight"
-        #     elif key.endswith(".b"):
-        #         new_key = key[:-2] + ".bias"
-        #     elif key.endswith(".w"):
-        #         new_key = key[:-2] + ".weight"
-        #     if new_key:
-        #         old_keys.append(key)
-        #         new_keys.append(new_key)
-        # for old_key, new_key in zip(old_keys, new_keys):
-        #     state_dict[new_key] = state_dict.pop(old_key)
-
-        # missing_keys = []
-        # unexpected_keys = []
-        # error_msgs = []
-        # # copy state_dict so _load_from_state_dict can modify it
-        # metadata = getattr(state_dict, "_metadata", None)
-        # state_dict = state_dict.copy()
-        # if metadata is not None:
-        #     state_dict._metadata = metadata
-
-        # def load(module, prefix=""):
-        #     local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-        #     module._load_from_state_dict(
-        #         state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-        #     )
-        #     for name, child in module._modules.items():
-        #         if child is not None:
-        #             load(child, prefix + name + ".")
-
-        # start_model = model
-        # if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
-        #     start_model = model.transformer
-        # load(start_model, prefix="")
-
-        # if len(missing_keys) > 0:
-        #     logger.info(
-        #         "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-        #     )
-        # if len(unexpected_keys) > 0:
-        #     logger.info(
-        #         "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-        #     )
-        # if len(error_msgs) > 0:
-        #     raise RuntimeError(
-        #         "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-        #     )
+        model = PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
@@ -553,8 +432,6 @@ class GPT2Model(GPT2PreTrainedModel):
     Params:
         `config`: a GPT2Config class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@@ -591,14 +468,15 @@ class GPT2Model(GPT2PreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(GPT2Model, self).__init__(config)
-        self.output_attentions = output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+
         self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
-                                                        keep_multihead_output=keep_multihead_output)
+        block = Block(config.n_ctx, config, scale=True)
         self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
         self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
 
@@ -618,19 +496,13 @@ class GPT2Model(GPT2PreTrainedModel):
         # Copy word embeddings from the previous weights
         self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
 
-    def prune_heads(self, heads_to_prune):
+    def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [h.attn.multihead_output for h in self.h]
-
     def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
         if past is None:
             past_length = 0
@@ -675,20 +547,32 @@ class GPT2Model(GPT2PreTrainedModel):
         all_attentions = []
         all_hidden_states = []
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            all_hidden_states.append(hidden_states.view(*output_shape))
-            outputs = block(hidden_states, layer_past, head_mask[i])
-            if self.output_attentions:
-                attentions, hidden_states, present = outputs
-                all_attentions.append(attentions)
-            else:
-                hidden_states, present = outputs
-            presents.append(present)
-        hidden_states = self.ln_f(hidden_states)
-        all_hidden_states.append(hidden_states.view(*output_shape))
+            if self.output_hidden_states:
+                all_hidden_states.append(hidden_states.view(*output_shape))
 
+            outputs = block(hidden_states, layer_past, head_mask[i])
+            hidden_states, present = outputs[:2]
+            presents.append(present)
+
+            if self.output_attentions:
+                all_attentions.append(outputs[2])
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(*output_shape)
+        # Add last hidden state
+        if self.output_hidden_states:
+            all_hidden_states.append(hidden_states)
+
+        outputs = [hidden_states, presents]
+        if self.output_hidden_states:
+            outputs.append(all_hidden_states)
         if self.output_attentions:
-            return all_attentions, all_hidden_states, presents
-        return all_hidden_states, presents
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
+            all_attentions = list(t.view(*attention_output_shape) for t in all_attentions)
+            outputs.append(all_attentions)
+        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
 
 
 class GPT2LMHeadModel(GPT2PreTrainedModel):
@@ -740,10 +624,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
-        self.transformer = GPT2Model(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
+        self.transformer = GPT2Model(config)
         self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
         self.apply(self.init_weights)
 
@@ -756,14 +639,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
-        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states, presents = transformer_output
-        else:
-            hidden_states, presents = transformer_output
-        hidden_states = hidden_states[-1]
+        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
+        hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
+
+        outputs = [lm_logits] + transformer_outputs[1:]
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
@@ -772,10 +653,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            return loss
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits, presents
-        return lm_logits, presents
+            outputs = [loss] + outputs
+
+        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
 
 
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
@@ -832,12 +712,12 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
-        self.transformer = GPT2Model(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
+        self.transformer = GPT2Model(config)
         self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
         self.multiple_choice_head = GPT2MultipleChoiceHead(config)
+
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
@@ -848,28 +728,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
 
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
+    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
-        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states, presents = transformer_output
-        else:
-            hidden_states, presents = transformer_output
-        hidden_states = hidden_states[-1]
+        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
+        hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
-        losses = []
+
+        outputs = [lm_logits, mc_logits] + transformer_outputs[1:]
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
+                            mc_labels.view(-1))
+            outputs = [loss] + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
-        if losses:
-            return losses
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits, mc_logits, presents
-        return lm_logits, mc_logits, presents
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = [loss] + outputs
+
+        return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 670f250ef9..5ee4e9224a 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -147,7 +147,8 @@ class OpenAIGPTConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True
+        predict_special_tokens=True,
+        **kwargs
     ):
         """Constructs OpenAIGPTConfig.
 
@@ -172,6 +173,8 @@ class OpenAIGPTConfig(PretrainedConfig):
                 initializing all weight matrices.
             predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
+        super(OpenAIGPTConfig, self).__init__(**kwargs)
+
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
@@ -205,7 +208,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 
 
 class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, nx, n_ctx, config, scale=False):
         super(Attention, self).__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
@@ -215,9 +218,7 @@ class Attention(nn.Module):
         self.split_size = n_state
         self.scale = scale
 
-        self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
+        self.output_attentions = config.output_attentions
 
         self.c_attn = Conv1D(n_state * 3, nx)
         self.c_proj = Conv1D(n_state, nx)
@@ -256,9 +257,10 @@ class Attention(nn.Module):
         if head_mask is not None:
             w = w * head_mask
 
+        outputs = [torch.matmul(w, v)]
         if self.output_attentions:
-            return w, torch.matmul(w, v)
-        return torch.matmul(w, v)
+            outputs.append(w)
+        return outputs
 
     def merge_heads(self, x):
         x = x.permute(0, 2, 1, 3).contiguous()
@@ -280,19 +282,15 @@ class Attention(nn.Module):
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)
 
-        a = self._attn(query, key, value, head_mask)
-        if self.keep_multihead_output:
-            self.multihead_output = a
-            self.multihead_output.retain_grad()
+        attn_outputs = self._attn(query, key, value, head_mask)
+        a = attn_outputs[0]
 
-        if self.output_attentions:
-            attentions, a = a
         a = self.merge_heads(a)
         a = self.c_proj(a)
         a = self.resid_dropout(a)
-        if self.output_attentions:
-            return attentions, a
-        return a
+
+        outputs = [a] + attn_outputs[1:]
+        return outputs  # a, (attentions)
 
 
 class MLP(nn.Module):
@@ -311,25 +309,24 @@ class MLP(nn.Module):
 
 
 class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, n_ctx, config, scale=False):
         super(Block, self).__init__()
         nx = config.n_embd
-        self.output_attentions = output_attentions
-        self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
+        self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
 
     def forward(self, x, head_mask=None):
-        a = self.attn(x, head_mask=head_mask)
-        if self.output_attentions:
-            attentions, a = a
+        attn_outputs = self.attn(x, head_mask=head_mask)
+        a = attn_outputs[0]
+
         n = self.ln_1(x + a)
         m = self.mlp(n)
         h = self.ln_2(n + m)
-        if self.output_attentions:
-            return attentions, h
-        return h
+
+        outputs = [h] + attn_outputs[1:]
+        return outputs
 
 
 class OpenAIGPTLMHead(nn.Module):
@@ -368,11 +365,16 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         nn.init.normal_(self.linear.weight, std=0.02)
         nn.init.normal_(self.linear.bias, 0)
 
-    def forward(self, hidden_states, mc_token_ids):
-        # Classification logits
-        # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+    def forward(self, hidden_states, mc_token_ids=None):
+        """ Extract classification token hidden state and project it using self.linear
+            hidden_state: hidden state of shape (bsz, num_choices, seq_length, hidden_size)
+            mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
+            if mc_token_ids=None we take the last token of the sequence as classification token
+        """
+        if mc_token_ids is None:
+            mc_token_ids = torch.full_like(hidden_states[:, :, :1, :], hidden_states.shape[2] - 1, dtype=torch.long)
+        else:
+            mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
         # (bsz, num_choices, 1, hidden_size)
         multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
         # (bsz, num_choices, hidden_size)
@@ -388,13 +390,9 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """
     config_class = OpenAIGPTConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_openai_gpt
     base_model_prefix = "transformer"
 
-    def __init__(self, *inputs, **kwargs):
-        super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
-
     def init_weights(self, module):
         """ Initialize the weights.
         """
@@ -495,14 +493,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
-        self.output_attentions = output_attentions
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+
         self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
-                                                        keep_multihead_output=keep_multihead_output)
+        block = Block(config.n_ctx, config, scale=True)
         self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
 
         self.apply(self.init_weights)
@@ -521,19 +520,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Copy word embeddings from the previous weights
         self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
 
-    def prune_heads(self, heads_to_prune):
+    def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [h.attn.multihead_output for h in self.h]
-
     def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
@@ -574,19 +567,26 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         output_shape = input_shape + (hidden_states.size(-1),)
 
         all_attentions = []
-        all_hidden_states = [hidden_states.view(*output_shape)]
+        all_hidden_states = []
         for i, block in enumerate(self.h):
+            if self.output_hidden_states:
+                all_hidden_states.append(hidden_states.view(*output_shape))
+
             outputs = block(hidden_states, head_mask[i])
+            hidden_states = outputs[0]
             if self.output_attentions:
-                attentions, hidden_states = outputs
-                all_attentions.append(attentions)
-            else:
-                hidden_states = outputs
+                all_attentions.append(outputs[1])
+
+        # Add last layer
+        if self.output_hidden_states:
             all_hidden_states.append(hidden_states.view(*output_shape))
 
+        outputs = [hidden_states.view(*output_shape)]
+        if self.output_hidden_states:
+            outputs.append(all_hidden_states)
         if self.output_attentions:
-            return all_attentions, all_hidden_states
-        return all_hidden_states
+            outputs.append(all_attentions)
+        return outputs  # last hidden state, (all hidden states), (all attentions)
 
 
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
@@ -650,10 +650,9 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
-        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
+        self.transformer = OpenAIGPTModel(config)
         self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.apply(self.init_weights)
 
@@ -666,12 +665,11 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states = hidden_states
-        hidden_states = hidden_states[-1]
-
+        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
+        hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
+
+        outputs = [lm_logits] + transformer_outputs[1:]
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
@@ -680,10 +678,9 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            return loss
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits
-        return lm_logits
+            outputs = [loss] + outputs
+
+        return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
 
 
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
@@ -752,10 +749,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     ```
     """
 
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
-        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
+        self.transformer = OpenAIGPTModel(config)
         self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
         self.apply(self.init_weights)
@@ -768,26 +764,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
 
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
+    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states = hidden_states
-        hidden_states = hidden_states[-1]
+        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
+        hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
-        losses = []
+
+        outputs = [lm_logits, mc_logits] + transformer_outputs[1:]
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
+                            mc_labels.view(-1))
+            outputs = [loss] + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
-        if losses:
-            return losses
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits, mc_logits
-        return lm_logits, mc_logits
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = [loss] + outputs
+
+        return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 518abb86a3..84df603a53 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -209,7 +209,8 @@ class TransfoXLConfig(PretrainedConfig):
                  init="normal",
                  init_range=0.01,
                  proj_init_std=0.01,
-                 init_std=0.02):
+                 init_std=0.02,
+                 **kwargs):
         """Constructs TransfoXLConfig.
 
         Args:
@@ -244,6 +245,8 @@ class TransfoXLConfig(PretrainedConfig):
             proj_init_std: parameters initialized by N(0, init_std)
             init_std: parameters initialized by N(0, init_std)
         """
+        super(TransfoXLConfig, self).__init__(**kwargs)
+
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
@@ -287,6 +290,7 @@ class TransfoXLConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")
 
 
+
 class PositionalEmbedding(nn.Module):
     def __init__(self, demb):
         super(PositionalEmbedding, self).__init__()
@@ -306,6 +310,7 @@ class PositionalEmbedding(nn.Module):
             return pos_emb[:,None,:]
 
 
+
 class PositionwiseFF(nn.Module):
     def __init__(self, d_model, d_inner, dropout, pre_lnorm=False):
         super(PositionwiseFF, self).__init__()
@@ -341,11 +346,14 @@ class PositionwiseFF(nn.Module):
 
         return output
 
+
+
 class MultiHeadAttn(nn.Module):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, 
-                 pre_lnorm=False, r_r_bias=None, r_w_bias=None):
+                 pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
         super(MultiHeadAttn, self).__init__()
 
+        self.output_attentions = output_attentions
         self.n_head = n_head
         self.d_model = d_model
         self.d_head = d_head
@@ -371,7 +379,7 @@ class MultiHeadAttn(nn.Module):
             self.r_r_bias = r_r_bias
             self.r_w_bias = r_w_bias
 
-    def forward(self, h, attn_mask=None, mems=None):
+    def forward(self, h, attn_mask=None, mems=None, head_mask=None):
         ##### multihead attention
         # [hlen x bsz x n_head x d_head]
 
@@ -404,6 +412,10 @@ class MultiHeadAttn(nn.Module):
         attn_prob = F.softmax(attn_score, dim=1)
         attn_prob = self.dropatt(attn_prob)
 
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
         # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head]
         attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v))
         attn_vec = attn_vec.contiguous().view(
@@ -415,19 +427,23 @@ class MultiHeadAttn(nn.Module):
 
         if self.pre_lnorm:
             ##### residual connection
-            output = h + attn_out
+            outputs = [h + attn_out]
         else:
             ##### residual connection + layer normalization
-            output = self.layer_norm(h + attn_out)
+            outputs = [self.layer_norm(h + attn_out)]
 
-        return output
+        if self.output_attentions:
+            outputs.append(attn_prob)
+
+        return outputs
 
 class RelMultiHeadAttn(nn.Module):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
                  tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
-                 r_r_bias=None, r_w_bias=None):
+                 r_r_bias=None, r_w_bias=None, output_attentions=False):
         super(RelMultiHeadAttn, self).__init__()
 
+        self.output_attentions = output_attentions
         self.n_head = n_head
         self.d_model = d_model
         self.d_head = d_head
@@ -506,7 +522,7 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
 
         self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
 
-    def forward(self, w, r, attn_mask=None, mems=None):
+    def forward(self, w, r, attn_mask=None, mems=None, head_mask=None):
         qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
 
         if mems is not None:
@@ -561,6 +577,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         attn_prob = F.softmax(attn_score, dim=1)
         attn_prob = self.dropatt(attn_prob)
 
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
         #### compute attention vector
         attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))
 
@@ -574,18 +594,21 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
 
         if self.pre_lnorm:
             ##### residual connection
-            output = w + attn_out
+            outputs = [w + attn_out]
         else:
             ##### residual connection + layer normalization
-            output = self.layer_norm(w + attn_out)
+            outputs = [self.layer_norm(w + attn_out)]
 
-        return output
+        if self.output_attentions:
+            outputs.append(attn_prob)
+
+        return outputs
 
 class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
     def __init__(self, *args, **kwargs):
         super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
 
-    def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None):
+    def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_mask=None):
         # r_emb: [klen, n_head, d_head], used for term B
         # r_w_bias: [n_head, d_head], used for term C
         # r_bias: [klen, n_head], used for term D
@@ -646,6 +669,9 @@ class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
         attn_prob = F.softmax(attn_score, dim=1)
         attn_prob = self.dropatt(attn_prob)
 
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
         #### compute attention vector
         attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))
 
@@ -659,12 +685,17 @@ class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
 
         if self.pre_lnorm:
             ##### residual connection
-            output = w + attn_out
+            outputs = [w + attn_out]
         else:
             ##### residual connection + layer normalization
-            output = self.layer_norm(w + attn_out)
+            outputs = [self.layer_norm(w + attn_out)]
+
+        if self.output_attentions:
+            outputs.append(attn_prob)
+
+        return outputs
+
 
-        return output
 
 class DecoderLayer(nn.Module):
     def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):
@@ -674,13 +705,15 @@ class DecoderLayer(nn.Module):
         self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, 
                                      pre_lnorm=kwargs.get('pre_lnorm'))
 
-    def forward(self, dec_inp, dec_attn_mask=None, mems=None):
+    def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
 
-        output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask,
-                               mems=mems)
-        output = self.pos_ff(output)
+        attn_outputs = self.dec_attn(dec_inp, attn_mask=dec_attn_mask,
+                               mems=mems, head_mask=head_mask)
+        ff_output = self.pos_ff(attn_outputs[0])
 
-        return output
+        outputs = [ff_output] + attn_outputs[1:]
+
+        return outputs
 
 class RelLearnableDecoderLayer(nn.Module):
     def __init__(self, n_head, d_model, d_head, d_inner, dropout,
@@ -692,14 +725,16 @@ class RelLearnableDecoderLayer(nn.Module):
         self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, 
                                      pre_lnorm=kwargs.get('pre_lnorm'))
 
-    def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None):
+    def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
 
-        output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,
+        attn_outputs = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,
                                attn_mask=dec_attn_mask,
-                               mems=mems)
-        output = self.pos_ff(output)
+                               mems=mems, head_mask=head_mask)
+        ff_output = self.pos_ff(attn_outputs[0])
 
-        return output
+        outputs = [ff_output] + attn_outputs[1:]
+
+        return outputs
 
 class RelPartialLearnableDecoderLayer(nn.Module):
     def __init__(self, n_head, d_model, d_head, d_inner, dropout,
@@ -711,14 +746,17 @@ class RelPartialLearnableDecoderLayer(nn.Module):
         self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, 
                                      pre_lnorm=kwargs.get('pre_lnorm'))
 
-    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None):
+    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
 
-        output = self.dec_attn(dec_inp, r,
+        attn_outputs = self.dec_attn(dec_inp, r,
                                attn_mask=dec_attn_mask,
-                               mems=mems)
-        output = self.pos_ff(output)
+                               mems=mems, head_mask=head_mask)
+        ff_output = self.pos_ff(attn_outputs[0])
+
+        outputs = [ff_output] + attn_outputs[1:]
+
+        return outputs
 
-        return output
 
 
 class AdaptiveEmbedding(nn.Module):
@@ -791,13 +829,9 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
     """
     config_class = TransfoXLConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_transfo_xl
     base_model_prefix = "transformer"
 
-    def __init__(self, *inputs, **kwargs):
-        super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
-
     def _init_weight(self, weight):
         if self.config.init == 'uniform':
             nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
@@ -894,6 +928,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+
         self.n_token = config.n_token
 
         self.d_embed = config.d_embed
@@ -928,7 +965,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                         tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
                         dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
                         r_w_bias=None if config.untie_r else self.r_w_bias,
-                        r_r_bias=None if config.untie_r else self.r_r_bias)
+                        r_r_bias=None if config.untie_r else self.r_r_bias,
+                        output_attentions=self.output_attentions)
                 )
         elif config.attn_type == 1: # learnable embeddings
             for i in range(config.n_layer):
@@ -938,7 +976,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                         tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
                         dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
                         r_w_bias=None if config.untie_r else self.r_w_bias,
-                        r_r_bias=None if config.untie_r else self.r_r_bias)
+                        r_r_bias=None if config.untie_r else self.r_r_bias,
+                        output_attentions=self.output_attentions)
                 )
         elif config.attn_type in [2, 3]: # absolute embeddings
             for i in range(config.n_layer):
@@ -947,7 +986,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                         config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
                         dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
                         r_w_bias=None if config.untie_r else self.r_w_bias,
-                        r_r_bias=None if config.untie_r else self.r_r_bias)
+                        r_r_bias=None if config.untie_r else self.r_r_bias,
+                        output_attentions=self.output_attentions)
                 )
 
         self.same_length = config.same_length
@@ -965,17 +1005,21 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         elif self.attn_type == 3: # absolute deeper SA
             self.r_emb = nn.Parameter(torch.Tensor(
                     self.n_layer, self.max_klen, self.n_head, self.d_head))
+
         self.apply(self.init_weights)
 
     def backward_compatible(self):
         self.sample_softmax = -1
 
-
     def reset_length(self, tgt_len, ext_len, mem_len):
         self.tgt_len = tgt_len
         self.mem_len = mem_len
         self.ext_len = ext_len
 
+    def _prune_heads(self, heads):
+        logger.info("Head pruning is not implemented for Transformer-XL model")
+        pass
+
     def init_mems(self, data):
         if self.mem_len > 0:
             mems = []
@@ -1012,9 +1056,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         return new_mems
 
-    def _forward(self, dec_inp, mems=None):
+    def _forward(self, dec_inp, mems=None, head_mask=None):
         qlen, bsz = dec_inp.size()
 
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.n_layer
+
         word_emb = self.word_emb(dec_inp)
 
         mlen = mems[0].size(0) if mems is not None else 0
@@ -1033,6 +1092,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None]
 
         hids = []
+        attentions = []
         if self.attn_type == 0: # default
             pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, 
                                    dtype=word_emb.dtype)
@@ -1046,7 +1106,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             for i, layer in enumerate(self.layers):
                 hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
-                core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i)
+                layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask,
+                                      mems=mems_i, head_mask=head_mask[i])
+                core_out = layer_outputs[0]
+                if self.output_attentions:
+                    attentions.append(layer_outputs[1])
         elif self.attn_type == 1: # learnable
             core_out = self.drop(word_emb)
             for i, layer in enumerate(self.layers):
@@ -1058,8 +1122,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                     r_emb, r_bias = self.r_emb[i], self.r_bias[i]
 
                 mems_i = None if mems is None else mems[i]
-                core_out = layer(core_out, r_emb, self.r_w_bias[i],
-                        r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
+                layer_outputs = layer(core_out, r_emb, self.r_w_bias[i],
+                                      r_bias, dec_attn_mask=dec_attn_mask,
+                                      mems=mems_i, head_mask=head_mask[i])
+                core_out = layer_outputs[0]
+                if self.output_attentions:
+                    attentions.append(layer_outputs[1])
         elif self.attn_type == 2: # absolute
             pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
                                    dtype=word_emb.dtype)
@@ -1074,8 +1142,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 mems_i = None if mems is None else mems[i]
                 if mems_i is not None and i == 0:
                     mems_i += pos_emb[:mlen]
-                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
-                                 mems=mems_i)
+                layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
+                                 mems=mems_i, head_mask=head_mask[i])
+                core_out = layer_outputs[0]
+                if self.output_attentions:
+                    attentions.append(layer_outputs[1])
         elif self.attn_type == 3:
             core_out = self.drop(word_emb)
 
@@ -1093,16 +1164,30 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                     mems_i += cur_emb.view(mlen, 1, -1)
                 core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)
 
-                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
-                                 mems=mems_i)
+                layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
+                                      mems=mems_i, head_mask=head_mask[i])
+                core_out = layer_outputs[0]
+                if self.output_attentions:
+                    attentions.append(layer_outputs[1])
 
         core_out = self.drop(core_out)
 
         new_mems = self._update_mems(hids, mems, mlen, qlen)
 
-        return core_out, new_mems
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        outputs = [core_out.transpose(0, 1).contiguous(), new_mems]
+        if self.output_hidden_states:
+            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
+            hids.append(core_out)
+            hids = list(t.transpose(0, 1).contiguous() for t in hids)
+            outputs.append(hids)
+        if self.output_attentions:
+            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+            attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+            outputs.append(attentions)
+        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
-    def forward(self, input_ids, mems=None):
+    def forward(self, input_ids, mems=None, head_mask=None):
         """ Params:
                 input_ids :: [bsz, len]
                 mems :: optional mems from previous forwar passes (or init_mems)
@@ -1122,11 +1207,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         if mems is None:
             mems = self.init_mems(input_ids)
-        last_hidden, new_mems = self._forward(input_ids, mems=mems)
+        outputs = self._forward(input_ids, mems=mems, head_mask=head_mask)
 
-        # We transpose back here to shape [bsz, len, hidden_dim]
-        last_hidden = last_hidden.transpose(0, 1).contiguous()
-        return (last_hidden, new_mems)
+        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
 
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
@@ -1218,7 +1301,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     def init_mems(self, data):
         return self.transformer.init_mems(data)
 
-    def forward(self, input_ids, labels=None, mems=None):
+    def forward(self, input_ids, labels=None, mems=None, head_mask=None):
         """ Params:
                 input_ids :: [bsz, len]
                 labels :: [bsz, len]
@@ -1235,19 +1318,26 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
 
-        last_hidden, new_mems = self.transformer(input_ids, mems)
+        transformer_outputs = self.transformer(input_ids, mems, head_mask)
 
+        last_hidden = transformer_outputs[0]
         pred_hid = last_hidden[:, -tgt_len:]
+        outputs = transformer_outputs[1:]
         if self.sample_softmax > 0 and self.training:
             assert self.config.tie_weight
             logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, labels, pred_hid, self.sampler)
             softmax_output = -F.log_softmax(logit, -1)[:, :, 0]
+            outputs = [softmax_output] + outputs
+            if labels is not None:
+                # TODO: This is not implemented
+                raise NotImplementedError
         else:
             softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels)
             if labels is None:
                 softmax_output = softmax_output.view(bsz, tgt_len, -1)
+                outputs = [softmax_output] + outputs
             else:
                 softmax_output = softmax_output.view(bsz, tgt_len)
+                outputs = [softmax_output, None] + outputs
 
-        # We transpose back
-        return (softmax_output, new_mems)
+        return outputs  # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions)
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 92e1cc124c..b86c9778a2 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -73,6 +73,7 @@ class XLMConfig(PretrainedConfig):
 
     def __init__(self,
                  vocab_size_or_config_json_file,
+                 causal=True,
                  d_model=1024,
                  n_layer=24,
                  n_head=16,
@@ -145,6 +146,7 @@ class XLMConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.n_token = vocab_size_or_config_json_file
+            self.causal = causal
             self.d_model = d_model
             self.n_layer = n_layer
             self.n_head = n_head
@@ -396,7 +398,6 @@ class XLMPreTrainedModel(PreTrainedModel):
     """
     config_class = XLMConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = None
     base_model_prefix = "xlm"
 
@@ -429,7 +430,7 @@ class XLMModel(XLMPreTrainedModel):
                   'hidden_dim', 'dropout', 'attention_dropout', 'asm',
                   'asm_cutoffs', 'asm_div_value']
 
-    def __init__(self, params, output_attentions=False, keep_multihead_output=False):  #, dico, is_encoder, with_output):
+    def __init__(self, params, output_attentions=False, output_hidden_states=False):  #, dico, is_encoder, with_output):
         """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
             Paper: https://arxiv.org/abs/1901.07291
             Original code: https://github.com/facebookresearch/XLM
@@ -483,11 +484,13 @@ class XLMModel(XLMPreTrainedModel):
         """
         super(XLMModel, self).__init__(params)
         self.output_attentions = output_attentions
+        self.output_hidden_states = output_hidden_states
 
         # encoder / decoder, output layer
         # self.is_encoder = is_encoder
         # self.is_decoder = not is_encoder
         # self.with_output = with_output
+        self.causal = params.causal
 
         # dictionary / languages
         self.n_langs = params.n_langs
@@ -536,63 +539,45 @@ class XLMModel(XLMPreTrainedModel):
             self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout, gelu_activation=params.gelu_activation))
             self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
 
-        # output layer
-        # if self.with_output:
-        #     self.pred_layer = PredLayer(params)
-        #     if params.share_inout_emb:
-        #         self.pred_layer.proj.weight = self.embeddings.weight
-
-    # def forward(self, mode, **kwargs):
-    #     """
-    #     Forward function with different forward modes.
-    #     ### Small hack to handle PyTorch distributed.
-    #     """
-    #     if mode == 'fwd':
-    #         return self.fwd(**kwargs)
-    #     elif mode == 'predict':
-    #         return self.predict(**kwargs)
-    #     else:
-    #         raise Exception("Unknown mode: %s" % mode)
-
-    def forward(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, langs=None, cache=None):
+    def forward(self, x, lengths, positions=None, langs=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
         """
         Inputs:
-            `x` LongTensor(slen, bs), containing word indices
+            `x` LongTensor(bs, slen), containing word indices
             `lengths` LongTensor(bs), containing the length of each sentence
             `causal` Boolean, if True, the attention is only done over previous hidden states
-            `positions` LongTensor(slen, bs), containing word positions
-            `langs` LongTensor(slen, bs), containing language IDs
+            `positions` LongTensor(bs, slen), containing word positions
+            `langs` LongTensor(bs, slen), containing language IDs
         """
         # lengths = (x != self.pad_index).float().sum(dim=1)
         # mask = x != self.pad_index
 
         # check inputs
-        slen, bs = x.size()
+        bs, slen = x.size()
         assert lengths.size(0) == bs
         assert lengths.max().item() <= slen
-        x = x.transpose(0, 1)  # batch size as dimension 0
-        assert (src_enc is None) == (src_len is None)
-        if src_enc is not None:
-            assert self.is_decoder
-            assert src_enc.size(0) == bs
+        # x = x.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
 
         # generate masks
-        mask, attn_mask = get_masks(slen, lengths, causal)
-        if self.is_decoder and src_enc is not None:
-            src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+        mask, attn_mask = get_masks(slen, lengths, self.causal)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
 
         # positions
         if positions is None:
             positions = x.new(slen).long()
             positions = torch.arange(slen, out=positions).unsqueeze(0)
         else:
-            assert positions.size() == (slen, bs)
-            positions = positions.transpose(0, 1)
+            assert positions.size() == (bs, slen)  # (slen, bs)
+            # positions = positions.transpose(0, 1)
 
         # langs
         if langs is not None:
-            assert langs.size() == (slen, bs)
-            langs = langs.transpose(0, 1)
+            assert langs.size() == (bs, slen)  # (slen, bs)
+            # langs = langs.transpose(0, 1)
 
         # do not recompute cached elements
         if cache is not None:
@@ -614,620 +599,50 @@ class XLMModel(XLMPreTrainedModel):
         tensor *= mask.unsqueeze(-1).to(tensor.dtype)
 
         # transformer layers
+        hidden_states = []
+        attentions = []
         for i in range(self.n_layers):
+            if self.output_hidden_states:
+                hidden_states.append(tensor)
 
             # self attention
-            attn = self.attentions[i](tensor, attn_mask, cache=cache)
+            attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
+            attn = attn_outputs[0]
+            if self.output_attentions:
+                attentions.append(attn_outputs[1])
             attn = F.dropout(attn, p=self.dropout, training=self.training)
             tensor = tensor + attn
             tensor = self.layer_norm1[i](tensor)
 
             # encoder attention (for decoder only)
-            if self.is_decoder and src_enc is not None:
-                attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
-                attn = F.dropout(attn, p=self.dropout, training=self.training)
-                tensor = tensor + attn
-                tensor = self.layer_norm15[i](tensor)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
 
             # FFN
             tensor = tensor + self.ffns[i](tensor)
             tensor = self.layer_norm2[i](tensor)
             tensor *= mask.unsqueeze(-1).to(tensor.dtype)
 
+        # Add last hidden state
+        if self.output_hidden_states:
+            hidden_states.append(tensor)
+
         # update cache length
         if cache is not None:
             cache['slen'] += tensor.size(1)
 
         # move back sequence length to dimension 0
-        tensor = tensor.transpose(0, 1)
+        # tensor = tensor.transpose(0, 1)
 
-        return tensor
-
-    def predict(self, tensor, pred_mask, y, get_scores):
-        """
-        Given the last hidden state, compute word scores and/or the loss.
-            `pred_mask` is a ByteTensor of shape (slen, bs), filled with 1 when
-                we need to predict a word
-            `y` is a LongTensor of shape (pred_mask.sum(),)
-            `get_scores` is a boolean specifying whether we need to return scores
-        """
-        masked_tensor = tensor[pred_mask.unsqueeze(-1).expand_as(tensor)].view(-1, self.dim)
-        scores, loss = self.pred_layer(masked_tensor, y, get_scores)
-        return scores, loss
-
-    def generate(self, src_enc, src_len, tgt_lang_id, max_len=200, sample_temperature=None):
-        """
-        Decode a sentence given initial start.
-        `x`:
-            - LongTensor(bs, slen)
-                <EOS> W1 W2 W3 <EOS> <PAD>
-                <EOS> W1 W2 W3   W4  <EOS>
-        `lengths`:
-            - LongTensor(bs) [5, 6]
-        `positions`:
-            - False, for regular "arange" positions (LM)
-            - True, to reset positions from the new generation (MT)
-        `langs`:
-            - must be None if the model only supports one language
-            - lang_id if only one language is involved (LM)
-            - (lang_id1, lang_id2) if two languages are involved (MT)
-        """
-
-        # input batch
-        bs = len(src_len)
-        assert src_enc.size(0) == bs
-
-        # generated sentences
-        generated = src_len.new(max_len, bs)  # upcoming output
-        generated.fill_(self.pad_index)       # fill upcoming ouput with <PAD>
-        generated[0].fill_(self.eos_index)    # we use <EOS> for <BOS> everywhere
-
-        # positions
-        positions = src_len.new(max_len).long()
-        positions = torch.arange(max_len, out=positions).unsqueeze(1).expand(max_len, bs)
-
-        # language IDs
-        langs = src_len.new(max_len).long().fill_(tgt_lang_id)
-        langs = langs.unsqueeze(1).expand(max_len, bs)
-
-        # current position / max lengths / length of generated sentences / unfinished sentences
-        cur_len = 1
-        gen_len = src_len.clone().fill_(1)
-        unfinished_sents = src_len.clone().fill_(1)
-
-        # cache compute states
-        cache = {'slen': 0}
-
-        while cur_len < max_len:
-
-            # compute word scores
-            tensor = self.forward(
-                'fwd',
-                x=generated[:cur_len],
-                lengths=gen_len,
-                positions=positions[:cur_len],
-                langs=langs[:cur_len],
-                causal=True,
-                src_enc=src_enc,
-                src_len=src_len,
-                cache=cache
-            )
-            assert tensor.size() == (1, bs, self.dim)
-            tensor = tensor.data[-1, :, :]               # (bs, dim)
-            scores = self.pred_layer.get_scores(tensor)  # (bs, n_words)
-
-            # select next words: sample or greedy
-            if sample_temperature is None:
-                next_words = torch.topk(scores, 1)[1].squeeze(1)
-            else:
-                next_words = torch.multinomial(F.softmax(scores / sample_temperature, dim=1), 1).squeeze(1)
-            assert next_words.size() == (bs,)
-
-            # update generations / lengths / finished sentences / current length
-            generated[cur_len] = next_words * unfinished_sents + self.pad_index * (1 - unfinished_sents)
-            gen_len.add_(unfinished_sents)
-            unfinished_sents.mul_(next_words.ne(self.eos_index).long())
-            cur_len = cur_len + 1
-
-            # stop when there is a </s> in each sentence, or if we exceed the maximul length
-            if unfinished_sents.max() == 0:
-                break
-
-        # add <EOS> to unfinished sentences
-        if cur_len == max_len:
-            generated[-1].masked_fill_(unfinished_sents.byte(), self.eos_index)
-
-        # sanity check
-        assert (generated == self.eos_index).sum() == 2 * bs
-
-        return generated[:cur_len], gen_len
-
-    def generate_beam(self, src_enc, src_len, tgt_lang_id, beam_size, length_penalty, early_stopping, max_len=200):
-        """
-        Decode a sentence given initial start.
-        `x`:
-            - LongTensor(bs, slen)
-                <EOS> W1 W2 W3 <EOS> <PAD>
-                <EOS> W1 W2 W3   W4  <EOS>
-        `lengths`:
-            - LongTensor(bs) [5, 6]
-        `positions`:
-            - False, for regular "arange" positions (LM)
-            - True, to reset positions from the new generation (MT)
-        `langs`:
-            - must be None if the model only supports one language
-            - lang_id if only one language is involved (LM)
-            - (lang_id1, lang_id2) if two languages are involved (MT)
-        """
-
-        # check inputs
-        assert src_enc.size(0) == src_len.size(0)
-        assert beam_size >= 1
-
-        # batch size / number of words
-        bs = len(src_len)
-        n_words = self.n_words
-
-        # expand to beam size the source latent representations / source lengths
-        src_enc = src_enc.unsqueeze(1).expand((bs, beam_size) + src_enc.shape[1:]).contiguous().view((bs * beam_size,) + src_enc.shape[1:])
-        src_len = src_len.unsqueeze(1).expand(bs, beam_size).contiguous().view(-1)
-
-        # generated sentences (batch with beam current hypotheses)
-        generated = src_len.new(max_len, bs * beam_size)  # upcoming output
-        generated.fill_(self.pad_index)                   # fill upcoming ouput with <PAD>
-        generated[0].fill_(self.eos_index)                # we use <EOS> for <BOS> everywhere
-
-        # generated hypotheses
-        generated_hyps = [BeamHypotheses(beam_size, max_len, length_penalty, early_stopping) for _ in range(bs)]
-
-        # positions
-        positions = src_len.new(max_len).long()
-        positions = torch.arange(max_len, out=positions).unsqueeze(1).expand_as(generated)
-
-        # language IDs
-        langs = positions.clone().fill_(tgt_lang_id)
-
-        # scores for each sentence in the beam
-        beam_scores = src_enc.new(bs, beam_size).fill_(0)
-        beam_scores[:, 1:] = -1e9
-        beam_scores = beam_scores.view(-1)
-
-        # current position
-        cur_len = 1
-
-        # cache compute states
-        cache = {'slen': 0}
-
-        # done sentences
-        done = [False for _ in range(bs)]
-
-        while cur_len < max_len:
-
-            # compute word scores
-            tensor = self.forward(
-                'fwd',
-                x=generated[:cur_len],
-                lengths=src_len.new(bs * beam_size).fill_(cur_len),
-                positions=positions[:cur_len],
-                langs=langs[:cur_len],
-                causal=True,
-                src_enc=src_enc,
-                src_len=src_len,
-                cache=cache
-            )
-            assert tensor.size() == (1, bs * beam_size, self.dim)
-            tensor = tensor.data[-1, :, :]               # (bs * beam_size, dim)
-            scores = self.pred_layer.get_scores(tensor)  # (bs * beam_size, n_words)
-            scores = F.log_softmax(scores, dim=-1)       # (bs * beam_size, n_words)
-            assert scores.size() == (bs * beam_size, n_words)
-
-            # select next words with scores
-            _scores = scores + beam_scores[:, None].expand_as(scores)  # (bs * beam_size, n_words)
-            _scores = _scores.view(bs, beam_size * n_words)            # (bs, beam_size * n_words)
-
-            next_scores, next_words = torch.topk(_scores, 2 * beam_size, dim=1, largest=True, sorted=True)
-            assert next_scores.size() == next_words.size() == (bs, 2 * beam_size)
-
-            # next batch beam content
-            # list of (bs * beam_size) tuple(next hypothesis score, next word, current position in the batch)
-            next_batch_beam = []
-
-            # for each sentence
-            for sent_id in range(bs):
-
-                # if we are done with this sentence
-                done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item())
-                if done[sent_id]:
-                    next_batch_beam.extend([(0, self.pad_index, 0)] * beam_size)  # pad the batch
-                    continue
-
-                # next sentence beam content
-                next_sent_beam = []
-
-                # next words for this sentence
-                for idx, value in zip(next_words[sent_id], next_scores[sent_id]):
-
-                    # get beam and word IDs
-                    beam_id = idx // n_words
-                    word_id = idx % n_words
-
-                    # end of sentence, or next word
-                    if word_id == self.eos_index or cur_len + 1 == max_len:
-                        generated_hyps[sent_id].add(generated[:cur_len, sent_id * beam_size + beam_id].clone(), value.item())
-                    else:
-                        next_sent_beam.append((value, word_id, sent_id * beam_size + beam_id))
-
-                    # the beam for next step is full
-                    if len(next_sent_beam) == beam_size:
-                        break
-
-                # update next beam content
-                assert len(next_sent_beam) == 0 if cur_len + 1 == max_len else beam_size
-                if len(next_sent_beam) == 0:
-                    next_sent_beam = [(0, self.pad_index, 0)] * beam_size  # pad the batch
-                next_batch_beam.extend(next_sent_beam)
-                assert len(next_batch_beam) == beam_size * (sent_id + 1)
-
-            # sanity check / prepare next batch
-            assert len(next_batch_beam) == bs * beam_size
-            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
-            beam_words = generated.new([x[1] for x in next_batch_beam])
-            beam_idx = src_len.new([x[2] for x in next_batch_beam])
-
-            # re-order batch and internal states
-            generated = generated[:, beam_idx]
-            generated[cur_len] = beam_words
-            for k in cache.keys():
-                if k != 'slen':
-                    cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx])
-
-            # update current length
-            cur_len = cur_len + 1
-
-            # stop when we are done with each sentence
-            if all(done):
-                break
-
-        # visualize hypotheses
-        # print([len(x) for x in generated_hyps], cur_len)
-        # globals().update( locals() );
-        # !import code; code.interact(local=vars())
-        # for ii in range(bs):
-        #     for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True):
-        #         print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist()))
-        #     print("")
-
-        # select the best hypotheses
-        tgt_len = src_len.new(bs)
-        best = []
-
-        for i, hypotheses in enumerate(generated_hyps):
-            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
-            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
-            best.append(best_hyp)
-
-        # generate target batch
-        decoded = src_len.new(tgt_len.max().item(), bs).fill_(self.pad_index)
-        for i, hypo in enumerate(best):
-            decoded[:tgt_len[i] - 1, i] = hypo
-            decoded[tgt_len[i] - 1, i] = self.eos_index
-
-        # sanity check
-        assert (decoded == self.eos_index).sum() == 2 * bs
-
-        return decoded, tgt_len
-
-
-class XLMModel(XLMPreTrainedModel):
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
-        super(XLMModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-
-        self.mem_len = config.mem_len
-        self.reuse_len = config.reuse_len
-        self.d_model = config.d_model
-        self.same_length = config.same_length
-        self.attn_type = config.attn_type
-        self.bi_data = config.bi_data
-        self.clamp_len = config.clamp_len
-
-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
-        self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
-        layer = XLMLayer(config, output_attentions=output_attentions,
-                                   keep_multihead_output=keep_multihead_output)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
-        self.dropout = nn.Dropout(config.dropout)
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.layer[layer].attention.prune_heads(heads)
-
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [layer.attention.self.multihead_output for layer in self.layer]
-
-    def create_mask(self, qlen, mlen):
-        """ create causal attention mask.
-            float mask where 1.0 indicate masked, 0.0 indicated not-masked.
-             same_length=False:      same_length=True:
-             <mlen > <  qlen >       <mlen > <  qlen >
-          ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
-            [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
-       qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
-            [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
-          v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
-        """
-        attn_mask = torch.ones([qlen, qlen])
-        mask_up = torch.triu(attn_mask, diagonal=1)
-        attn_mask_pad = torch.zeros([qlen, mlen])
-        ret = torch.cat([attn_mask_pad, mask_up], dim=1)
-        if self.same_length:
-            mask_lo = torch.tril(attn_mask, diagonal=-1)
-            ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1)
-
-        ret = ret.to(next(self.parameters()))
-        return ret
-
-    def cache_mem(self, curr_out, prev_mem):
-        """cache hidden states into memory."""
-        if self.mem_len is None or self.mem_len == 0:
-            return None
-        else:
-            if self.reuse_len is not None and self.reuse_len > 0:
-                curr_out = curr_out[:self.reuse_len]
-
-            if prev_mem is None:
-                new_mem = curr_out[-self.mem_len:]
-            else:
-                new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:]
-
-        return new_mem.detach()
-
-    @staticmethod
-    def positional_embedding(pos_seq, inv_freq, bsz=None):
-        sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
-        pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
-        pos_emb = pos_emb[:, None, :]
-
-        if bsz is not None:
-            pos_emb = pos_emb.expand(-1, bsz, -1)
-
-        return pos_emb
-
-    def relative_positional_encoding(self, qlen, klen, bsz=None):
-        """create relative positional encoding."""
-        freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
-        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
-
-        if self.attn_type == 'bi':
-            # beg, end = klen - 1, -qlen
-            beg, end = klen, -qlen
-        elif self.attn_type == 'uni':
-            # beg, end = klen - 1, -1
-            beg, end = klen, -1
-        else:
-            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
-
-        if self.bi_data:
-            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
-            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float)
-
-            if self.clamp_len > 0:
-                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-                bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-
-            if bsz is not None:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
-            else:
-                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
-                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
-
-            pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
-        else:
-            fwd_pos_seq = torch.arange(beg, end, -1.0)
-            if self.clamp_len > 0:
-                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
-            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
-
-        pos_emb = pos_emb.to(next(self.parameters()))
-        return pos_emb
-
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
-        """
-        Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the XLM model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
-            inp_q: [optional] float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
-
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
-            summary_type: str, "last", "first", "mean", or "attn". The method
-                to pool the input to get a vector representation.
-        """
-        # the original code for XLM uses shapes [len, bsz] with the batch dimension at the end
-        # but we want a unified interface in the library with the batch size on the first dimension
-        # so we move here the first dimension (batch) to the end
-        inp_k = inp_k.transpose(0, 1).contiguous()
-        token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
-        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
-        attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
-        perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
-        target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
-        inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
-
-        qlen, bsz = inp_k.shape[0], inp_k.shape[1]
-        mlen = mems[0].shape[0] if mems is not None else 0
-        klen = mlen + qlen
-
-        dtype_float = next(self.parameters()).dtype
-        device = next(self.parameters()).device
-
-        ##### Attention mask
-        # causal attention mask
-        if self.attn_type == 'uni':
-            attn_mask = self.create_mask(qlen, mlen)
-            attn_mask = attn_mask[:, :, None, None]
-        elif self.attn_type == 'bi':
-            attn_mask = None
-        else:
-            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
-
-        # data mask: input mask & perm mask
-        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
-        "or attention_mask (uses 0 for padding, added for compatbility with XLM). Please choose one."
-        if input_mask is None and attention_mask is not None:
-            input_mask = 1.0 - attention_mask
-        if input_mask is not None and perm_mask is not None:
-            data_mask = input_mask[None] + perm_mask
-        elif input_mask is not None and perm_mask is None:
-            data_mask = input_mask[None]
-        elif input_mask is None and perm_mask is not None:
-            data_mask = perm_mask
-        else:
-            data_mask = None
-
-        if data_mask is not None:
-            # all mems can be attended to
-            mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
-            data_mask = torch.cat([mems_mask, data_mask], dim=1)
-            if attn_mask is None:
-                attn_mask = data_mask[:, :, :, None]
-            else:
-                attn_mask += data_mask[:, :, :, None]
-
-        if attn_mask is not None:
-            attn_mask = (attn_mask > 0).to(dtype_float)
-
-        if attn_mask is not None:
-            non_tgt_mask = -torch.eye(qlen).to(attn_mask)
-            non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
-            non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
-        else:
-            non_tgt_mask = None
-
-        ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(inp_k)
-        output_h = self.dropout(word_emb_k)
-        if inp_q is not None:
-            if target_mapping is not None:
-                word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
-            else:
-                inp_q_ext = inp_q[:, :, None]
-                word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
-            output_g = self.dropout(word_emb_q)
-        else:
-            output_g = None
-
-        ##### Segment embedding
-        if token_type_ids is not None:
-            # Convert `token_type_ids` to one-hot `seg_mat`
-            mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
-            cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
-
-            # `1` indicates not in the same segment [qlen x klen x bsz]
-            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
-            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
-        else:
-            seg_mat = None
-
-        ##### Positional encoding
-        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
-        pos_emb = self.dropout(pos_emb)
-
-        ##### Head mask if needed (for bertology/pruning)
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [n_layer x num_heads]
-        # and head_mask is converted to shape [n_layer x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        new_mems = []
-        if mems is None:
-            mems = [None] * len(self.layer)
-
-        hidden_states = []
-        attentions = []
-        for i, layer_module in enumerate(self.layer):
-            # cache new mems
-            new_mems.append(self.cache_mem(output_h, mems[i]))
-            # Save hidden_states
-            if output_g is None:
-                hidden_states.append(output_h)
-            else:
-                hidden_states.append((output_h, output_g))
-
-            output_h, output_g = layer_module(output_h, output_g,
-                                              attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
-                                              r=pos_emb, seg_mat=seg_mat,
-                                              mems=mems[i], target_mapping=target_mapping,
-                                              head_mask=head_mask)
-        # Save last hidden_state
-        if output_g is None:
-            hidden_states.append(output_h)
-        else:
-            hidden_states.append((output_h, output_g))
-
-        # Select the right output and add dropout
-        output = self.dropout(output_g if output_g is not None else output_h)
-
-        # We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
-        output = output.permute(1, 0, 2).contiguous()
-        if output_g is None:
-            hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
-        else:
-            hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
-
-        # Build the list of outputs
-        outputs = [output, new_mems]
-        if self.output_attentions:
-            outputs.append(attentions)
+        outputs = [tensor]
         if self.output_hidden_states:
             outputs.append(hidden_states)
-
-        return outputs
+        if self.output_attentions:
+            outputs.append(attentions)
+        return outputs  # outputs, (hidden_states), (attentions)
 
 
 class XLMPredLayer(nn.Module):
@@ -1275,63 +690,59 @@ class XLMPredLayer(nn.Module):
         return self.proj.log_prob(x) if self.asm else self.proj(x)
 
 
-class XLMLMHeadModel(XLMPreTrainedModel):
-    """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
 
-    Params:
-        `config`: a XLMConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+class XLMWithLMHeadModel(XLMPreTrainedModel):
+        """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+            Paper: https://arxiv.org/abs/1901.07291
+            Original code: https://github.com/facebookresearch/XLM
 
-    Inputs:
-        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        attention_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
-            0 for real tokens and 1 for padding.
-        mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-            from previous batches. The length of the list equals n_layer.
-            If None, no memory is used.
-        perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
-            If perm_mask[k, i, j] = 0, i attend to j in batch k;
-            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-            If None, each position attends to all the others.
-        target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
-            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-            on the j-th token.
-            Only used during pretraining for partial prediction.
-            Set to None during finetuning.
-        inp_q: [optional] float32 Tensor in shape [bsz, len].
-            1 for tokens with losses and 0 for tokens without losses.
-            Only used during pretraining for two-stream attention.
-            Set to None during finetuning.
+        Params:
+            `config`: a XLMConfig class instance with the configuration to build a new model
+            `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+            `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+                This can be used to compute head importance metrics. Default: False
+
+        Inputs:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see XLM paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
-    Outputs: Tuple of (encoded_layers, pooled_output)
-        `encoded_layers`: controled by `output_all_encoded_layers` argument:
-            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
-                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, d_model],
-            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                to the last attention block of shape [batch_size, sequence_length, d_model],
-        `pooled_output`: a torch.FloatTensor of size [batch_size, d_model] which is the output of a
-            classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
+        Outputs: Tuple of (encoded_layers, pooled_output)
+            `encoded_layers`: controled by `output_all_encoded_layers` argument:
+                - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                    of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
+                    encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+                - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
+            `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+                classifier pretrained on top of the hidden state associated to the first character of the
+                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        Example usage:
+        ```python
+        # Already been converted into WordPiece token ids
+        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
-        n_layer=12, num_attention_heads=12, intermediate_size=3072)
+        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = modeling.XLMModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
-    """
+        model = modeling.XLMModel(config=config)
+        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+        ```
+        """
     def __init__(self, config, output_attentions=False, output_hidden_states=False):
         super(XLMLMHeadModel, self).__init__(config)
         self.output_attentions = output_attentions
@@ -1341,9 +752,7 @@ class XLMLMHeadModel(XLMPreTrainedModel):
         self.same_length = config.same_length
 
         self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
-
-        # Tie weights
+        self.pred_layer = XLMPredLayer(config)
 
         self.apply(self.init_weights)
         self.tie_weights()
@@ -1351,10 +760,9 @@ class XLMLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        self.lm_loss.weight = self.transformer.word_embedding.weight
+        self.pred_layer.proj.weight = self.transformer.embeddings.weight
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+    def forward(self, x, lengths, positions=None, langs=None, cache=None,
                 labels=None, head_mask=None):
         """
         Args:
@@ -1382,11 +790,10 @@ class XLMLMHeadModel(XLMPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+        transformer_outputs = self.transformer(x, lengths, positions=positions, langs=langs, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
-        logits = self.lm_loss(output)
+        logits = self.pred_layer(output, labels)
 
         outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
 
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index c8fff081cb..754a03f37d 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -198,7 +198,7 @@ class XLNetConfig(PretrainedConfig):
     pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file,
+                 vocab_size_or_config_json_file=32000,
                  d_model=1024,
                  n_layer=24,
                  n_head=16,
@@ -221,7 +221,12 @@ class XLNetConfig(PretrainedConfig):
                  bi_data=False,
                  clamp_len=-1,
                  same_length=False,
-                 finetuning_task=None):
+                 
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type="last",
+                 use_proj=True,
+                 **kwargs):
         """Constructs XLNetConfig.
 
         Args:
@@ -265,6 +270,8 @@ class XLNetConfig(PretrainedConfig):
             same_length: bool, whether to use the same attention length for each token.
             finetuning_task: name of the glue task on which the model was fine-tuned if any
         """
+        super(XLNetConfig, self).__init__(**kwargs)
+
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
@@ -297,7 +304,11 @@ class XLNetConfig(PretrainedConfig):
             self.bi_data = bi_data
             self.clamp_len = clamp_len
             self.same_length = same_length
+
             self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.use_proj = use_proj
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -323,9 +334,10 @@ except ImportError:
             return self.weight * x + self.bias
 
 class XLNetRelativeAttention(nn.Module):
-    def __init__(self, config, output_attentions=False):
+    def __init__(self, config):
         super(XLNetRelativeAttention, self).__init__()
-        self.output_attentions = output_attentions
+        self.output_attentions = config.output_attentions
+
         if config.d_model % config.n_head != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -533,10 +545,9 @@ class XLNetFeedForward(nn.Module):
         return output
 
 class XLNetLayer(nn.Module):
-    def __init__(self, config, output_attentions=False, ):
+    def __init__(self, config):
         super(XLNetLayer, self).__init__()
-        self.output_attentions = output_attentions
-        self.rel_attn = XLNetRelativeAttention(config, output_attentions=output_attentions)
+        self.rel_attn = XLNetRelativeAttention(config)
         self.ff = XLNetFeedForward(config)
         self.dropout = nn.Dropout(config.dropout)
 
@@ -562,7 +573,6 @@ class XLNetPreTrainedModel(PreTrainedModel):
     """
     config_class = XLNetConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_xlnet
     base_model_prefix = "transformer"
 
@@ -589,10 +599,10 @@ class XLNetPreTrainedModel(PreTrainedModel):
 
 
 class XLNetModel(XLNetPreTrainedModel):
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(XLNetModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
 
         self.mem_len = config.mem_len
         self.reuse_len = config.reuse_len
@@ -601,25 +611,17 @@ class XLNetModel(XLNetPreTrainedModel):
         self.attn_type = config.attn_type
         self.bi_data = config.bi_data
         self.clamp_len = config.clamp_len
+        self.n_layer = config.n_layer
 
         self.word_embedding = nn.Embedding(config.n_token, config.d_model)
         self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
-        layer = XLNetLayer(config, output_attentions=output_attentions)
+        layer = XLNetLayer(config)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
 
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.layer[layer].attention.prune_heads(heads)
-
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [layer.attention.self.multihead_output for layer in self.layer]
+    def _prune_heads(self, heads_to_prune):
+        logger.info("Head pruning is not implemented for XLNet")
+        pass
 
     def create_mask(self, qlen, mlen):
         """ create causal attention mask.
@@ -708,11 +710,11 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
@@ -751,7 +753,7 @@ class XLNetModel(XLNetPreTrainedModel):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
-        inp_k = inp_k.transpose(0, 1).contiguous()
+        input_ids = input_ids.transpose(0, 1).contiguous()
         token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
         input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
         attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
@@ -759,7 +761,7 @@ class XLNetModel(XLNetPreTrainedModel):
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
         inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
 
-        qlen, bsz = inp_k.shape[0], inp_k.shape[1]
+        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         mlen = mems[0].shape[0] if mems is not None else 0
         klen = mlen + qlen
 
@@ -810,7 +812,7 @@ class XLNetModel(XLNetPreTrainedModel):
             non_tgt_mask = None
 
         ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(inp_k)
+        word_emb_k = self.word_embedding(input_ids)
         output_h = self.dropout(word_emb_k)
         if inp_q is not None:
             if target_mapping is not None:
@@ -838,20 +840,20 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
         pos_emb = self.dropout(pos_emb)
 
-        ##### Head mask if needed (for bertology/pruning)
+        # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [n_layer x num_heads]
-        # and head_mask is converted to shape [n_layer x batch x num_heads x seq_length x seq_length]
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
         if head_mask is not None:
             if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
         else:
-            head_mask = [None] * self.config.n_layer
+            head_mask = [None] * self.n_layer
 
         new_mems = []
         if mems is None:
@@ -870,7 +872,7 @@ class XLNetModel(XLNetPreTrainedModel):
                                    head_mask=head_mask[i])
             output_h, output_g = outputs[:2]
             if self.output_attentions:
-                attentions.append(outputs[2:])
+                attentions.append(outputs[2])
 
         # Add last hidden state
         if self.output_hidden_states:
@@ -887,6 +889,7 @@ class XLNetModel(XLNetPreTrainedModel):
                 hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
             outputs.append(hidden_states)
         if self.output_attentions:
+            attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
             outputs.append(attentions)
 
         return outputs  # outputs, new_mems, (hidden_states), (attentions)
@@ -902,7 +905,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             This can be used to compute head importance metrics. Default: False
 
     Inputs:
-        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
         token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
         input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
@@ -953,16 +956,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-
         self.attn_type = config.attn_type
         self.same_length = config.same_length
 
-        self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                              output_hidden_states=output_hidden_states)
+        self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
         # Tie weights
@@ -975,12 +974,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self.lm_loss.weight = self.transformer.word_embedding.weight
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 labels=None, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
@@ -1008,7 +1007,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
 
         logits = self.lm_loss(transformer_outputs[0])
@@ -1025,14 +1024,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
 class XLNetSequenceSummary(nn.Module):
-    def __init__(self, config, summary_type="last", use_proj=True):
+    def __init__(self, config):
         super(XLNetSequenceSummary, self).__init__()
-        self.summary_type = summary_type
-        if use_proj:
+        self.summary_type = config.summary_type
+        if config.use_proj:
             self.summary = nn.Linear(config.d_model, config.d_model)
         else:
             self.summary = None
-        if summary_type == 'attn':
+        if config.summary_type == 'attn':
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
@@ -1069,7 +1068,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             to pool the input to get a vector representation. Default: last
 
     Inputs:
-        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+        input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
         token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
         input_mask: float32 Tensor in shape [bsz, len], the input mask.
             0 for real tokens and 1 for padding.
@@ -1121,30 +1120,21 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(XLNetForSequenceClassification, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.attn_type = config.attn_type
-        self.same_length = config.same_length
-        self.summary_type = summary_type
-        self.num_labels = num_labels
+        self.transformer = XLNetModel(config)
+        self.sequence_summary = XLNetSequenceSummary(config)
+        self.logits_proj = nn.Linear(config.d_model, config.num_labels)
 
-        self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                              output_hidden_states=output_hidden_states)
-
-        self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type, use_proj=use_proj)
-        self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 labels=None, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             input_mask: float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
@@ -1169,7 +1159,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
         output = transformer_outputs[0]
 
@@ -1247,20 +1237,18 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(XLNetForQuestionAnswering, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.transformer = XLNetModel(config, output_attentions=output_attentions,
-                                      output_hidden_states=output_hidden_states)
-        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.transformer = XLNetModel(config)
+        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)
+
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 start_positions=None, end_positions=None, head_mask=None):
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
+        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                             mems, perm_mask, target_mapping, inp_q, head_mask)
 
         logits = self.qa_outputs(transformer_outputs[0])
diff --git a/pytorch_pretrained_bert/tests/__init__.py b/pytorch_pretrained_bert/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/conftest.py b/pytorch_pretrained_bert/tests/conftest.py
similarity index 100%
rename from tests/conftest.py
rename to pytorch_pretrained_bert/tests/conftest.py
diff --git a/samples/input.txt b/pytorch_pretrained_bert/tests/fixtures/input.txt
similarity index 100%
rename from samples/input.txt
rename to pytorch_pretrained_bert/tests/fixtures/input.txt
diff --git a/samples/sample_text.txt b/pytorch_pretrained_bert/tests/fixtures/sample_text.txt
similarity index 100%
rename from samples/sample_text.txt
rename to pytorch_pretrained_bert/tests/fixtures/sample_text.txt
diff --git a/samples/test_sentencepiece.model b/pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model
similarity index 100%
rename from samples/test_sentencepiece.model
rename to pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
new file mode 100644
index 0000000000..759b31aa0a
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -0,0 +1,379 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import json
+import random
+
+import torch
+
+def create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
+    for model_class in model_classes:
+        config.output_hidden_states = True
+        model = model_class(config=config)
+        model.eval()
+        head_mask = torch.zeros(tester.num_hidden_layers, tester.num_attention_heads)
+        # Set that after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
+        head_mask.requires_grad_(requires_grad=True)
+        outputs = model(**inputs_dict, head_mask=head_mask)
+
+        # Compute some gradients
+        output = sum(t.sum() for t in outputs[0])
+        output = output.sum()
+        output.backward()
+        multihead_outputs = head_mask.grad
+
+        tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[0].size()),
+        #     [self.batch_size, self.num_attention_heads,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
+        #     0)
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[0][:, 0, :, :].nonzero()),
+        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
+        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[1].size()),
+        #     [self.batch_size, self.num_attention_heads,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[1].nonzero()),
+        #     multihead_outputs[1].numel())
+
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[-1].size()),
+        #     [self.batch_size, self.num_attention_heads,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
+        #     0)
+        # self.parent.assertEqual(
+        #     len(multihead_outputs[-1][:, 0, :, :].nonzero()),
+        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+
+
+def create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
+    for model_class in model_classes:
+        model = model_class(config=config)
+        model.eval()
+        heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
+                            -1: [0]}
+        model.prune_heads(heads_to_prune)
+        outputs = model(**inputs_dict)
+
+        # output = sum(t.sum() for t in outputs[0])
+        # output = output.sum()
+        # output.backward()
+        # multihead_outputs = bert_model.get_multihead_outputs()
+
+        # self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[0].size()),
+        #     [self.batch_size, 1,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[1].size()),
+        #     [self.batch_size, self.num_attention_heads,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        # self.parent.assertListEqual(
+        #     list(multihead_outputs[-1].size()),
+        #     [self.batch_size, self.num_attention_heads-1,
+        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+
+
+def create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
+    for model_class in model_classes:
+        config.output_attentions = True
+        config.output_hidden_states = False
+        model = model_class(config)
+        model.eval()
+        outputs = model(**inputs_dict)
+        attentions = outputs[-1]
+        tester.parent.assertEqual(model.config.output_attentions, True)
+        tester.parent.assertEqual(model.config.output_hidden_states, False)
+        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
+        tester.parent.assertListEqual(
+            list(attentions[0].shape[-3:]),
+            [tester.num_attention_heads,
+             tester.seq_length,
+             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
+        out_len = len(outputs)
+
+        # Check attention is always last and order is fine
+        config.output_attentions = True
+        config.output_hidden_states = True
+        model = model_class(config)
+        model.eval()
+        outputs = model(**inputs_dict)
+        tester.parent.assertEqual(out_len+1, len(outputs))
+        tester.parent.assertEqual(model.config.output_attentions, True)
+        tester.parent.assertEqual(model.config.output_hidden_states, True)
+
+        attentions = outputs[-1]
+        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
+        tester.parent.assertListEqual(
+            list(attentions[0].shape[-3:]),
+            [tester.num_attention_heads,
+             tester.seq_length,
+             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
+
+def create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
+    for model_class in model_classes:
+        config.output_hidden_states = True
+        config.output_attentions = False
+        model = model_class(config)
+        model.eval()
+        outputs = model(**inputs_dict)
+        hidden_states = outputs[-1]
+        tester.parent.assertEqual(model.config.output_attentions, False)
+        tester.parent.assertEqual(model.config.output_hidden_states, True)
+        tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
+        tester.parent.assertListEqual(
+            list(hidden_states[0].shape[-2:]),
+            [tester.seq_length, tester.hidden_size])
+
+
+def create_and_check_commons(tester, config, inputs_dict):
+    create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
+    create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
+    create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
+    create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
+
+
+def ids_tensor(shape, vocab_size, rng=None, name=None):
+    """Creates a random int32 tensor of the shape within the vocab size."""
+    if rng is None:
+        rng = random.Random()
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.randint(0, vocab_size - 1))
+
+    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+
+
+class ConfigTester(object):
+    def __init__(self, parent, config_class=None, **kwargs):
+        self.parent = parent
+        self.config_class = config_class
+        self.inputs_dict = kwargs
+
+    def create_and_test_config_to_json_string(self):
+        config = self.config_class(**self.inputs_dict)
+        obj = json.loads(config.to_json_string())
+        for key, value in self.inputs_dict.items():
+            self.parent.assertEqual(obj[key], value)
+
+    def create_and_test_config_to_json_file(self):
+        config_first = self.config_class(**self.inputs_dict)
+        json_file_path = "/tmp/config.json"
+        config_first.to_json_file(json_file_path)
+        config_second = self.config_class.from_json_file(json_file_path)
+        os.remove(json_file_path)
+        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
+
+    def run_common_tests(self):
+        self.create_and_test_config_to_json_string()
+        self.create_and_test_config_to_json_file()
+
+
+class GPTModelTester(object):
+    def __init__(self,
+                    parent,
+                    batch_size=13,
+                    seq_length=7,
+                    is_training=True,
+                    use_position_ids=True,
+                    use_token_type_ids=True,
+                    use_labels=True,
+                    vocab_size=99,
+                    n_special=1,
+                    n_positions=33,
+                    hidden_size=32,
+                    num_hidden_layers=5,
+                    num_attention_heads=4,
+                    n_choices=3,
+                    type_sequence_label_size=2,
+                    initializer_range=0.02,
+                    num_labels=3,
+                    scope=None,
+                    config_class=None,
+                    base_model_class=None,
+                    lm_head_model_class=None,
+                    double_head_model_class=None,
+                    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_position_ids = use_position_ids
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.n_special = n_special
+        self.n_positions = n_positions
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_choices = n_choices
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.scope = scope
+        self.config_class = config_class
+        self.base_model_class = base_model_class
+        self.lm_head_model_class = lm_head_model_class
+        self.double_head_model_class = double_head_model_class
+        self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
+
+    def prepare_config_and_inputs(self):
+        total_num_tokens = self.vocab_size + self.n_special
+        input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
+
+        position_ids = None
+        if self.use_position_ids:
+            position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            total_voc = self.vocab_size
+            token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
+
+        mc_labels = None
+        lm_labels = None
+        mc_token_ids = None
+        if self.use_labels:
+            mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
+            mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
+
+        config = self.config_class(
+            vocab_size_or_config_json_file=self.vocab_size,
+            n_special=self.n_special,
+            n_positions=self.n_positions,
+            n_embd=self.hidden_size,
+            n_layer=self.num_hidden_layers,
+            n_head=self.num_attention_heads,
+            initializer_range=self.initializer_range)
+
+        return (config, input_ids, token_type_ids, position_ids,
+                mc_labels, lm_labels, mc_token_ids)
+
+    def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
+                            mc_labels, lm_labels, mc_token_ids):
+        model = self.base_model_class(config)
+        model.eval()
+        outputs = model(input_ids, position_ids, token_type_ids)
+        hidden_state = outputs[0]
+        self.parent.assertListEqual(
+            list(hidden_state.size()),
+            [self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
+
+
+    def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
+                                    mc_labels, lm_labels, mc_token_ids):
+        model = self.lm_head_model_class(config)
+        model.eval()
+        outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+        loss, lm_logits = outputs[:2]
+
+        total_voc = self.n_special + self.vocab_size
+        self.parent.assertListEqual(
+            list(lm_logits.size()),
+            [self.batch_size, self.n_choices, self.seq_length, total_voc])
+        self.parent.assertListEqual(
+            list(loss.size()),
+            [])
+
+    def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
+                                    mc_labels, lm_labels, mc_token_ids):
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.eval()
+            outputs = model(input_ids)
+            presents = outputs[-1]
+            self.parent.assertEqual(self.num_hidden_layers, len(presents))
+            self.parent.assertListEqual(
+                list(presents[0].size()),
+                [2, self.batch_size * self.n_choices, self.num_attention_heads,
+                    self.seq_length, self.hidden_size // self.num_attention_heads])
+
+    def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
+                                    mc_labels, lm_labels, mc_token_ids):
+        model = self.double_head_model_class(config)
+        model.eval()
+        outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+                                                    token_type_ids=token_type_ids, position_ids=position_ids)
+        lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
+        loss = [lm_loss, mc_loss]
+
+        total_voc = self.n_special + self.vocab_size
+        self.parent.assertListEqual(
+            list(lm_logits.size()),
+            [self.batch_size, self.n_choices, self.seq_length, total_voc])
+        self.parent.assertListEqual(
+            list(mc_logits.size()),
+            [self.batch_size, self.n_choices])
+        self.parent.assertListEqual(
+            [list(l.size()) for l in loss],
+            [[], []])
+
+    def create_and_check_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.parent.assertIsNotNone(model)
+
+    def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
+                                    mc_labels, lm_labels, mc_token_ids):
+        inputs_dict = {'input_ids': input_ids}
+        create_and_check_commons(self, config, inputs_dict)
+
+    def run_common_tests(self, test_presents=False):
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_base_model(*config_and_inputs)
+
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_lm_head(*config_and_inputs)
+
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_double_heads(*config_and_inputs)
+
+        if test_presents:
+            config_and_inputs = self.prepare_config_and_inputs()
+            self.create_and_check_presents(*config_and_inputs)
+
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_commons(*config_and_inputs)
+
+    def run_slow_tests(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        self.create_and_check_model_from_pretrained(*config_and_inputs)
+
diff --git a/pytorch_pretrained_bert/tests/model_utils_test.py b/pytorch_pretrained_bert/tests/model_utils_test.py
new file mode 100644
index 0000000000..76585453c8
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/model_utils_test.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_pretrained_bert import PretrainedConfig, PreTrainedModel
+from pytorch_pretrained_bert.modeling import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py b/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
new file mode 100644
index 0000000000..552599b1fc
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
+                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
+
+from .model_tests_commons import (create_and_check_for_attentions, create_and_check_for_head_pruning,
+                                  create_and_check_for_headmasking, create_and_check_for_hidden_states,
+                                  ConfigTester, GPTModelTester)
+
+class GPT2ModelTest(unittest.TestCase):
+
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
+        config_tester.run_common_tests()
+
+    def test_model(self):
+        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                           lm_head_model_class=GPT2LMHeadModel,
+                                           double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester.run_common_tests(test_presents=True)
+
+    @pytest.mark.slow
+    def test_pretrained(self):
+        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                           lm_head_model_class=GPT2LMHeadModel,
+                                           double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester.run_slow_tests()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_pretrained_bert/tests/modeling_openai_test.py
new file mode 100644
index 0000000000..83ef480f49
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/modeling_openai_test.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
+                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+
+from .model_tests_commons import (create_and_check_for_attentions, create_and_check_for_head_pruning,
+                                  create_and_check_for_headmasking, create_and_check_for_hidden_states,
+                                  ConfigTester, GPTModelTester)
+
+class OpenAIModelTest(unittest.TestCase):
+
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
+        config_tester.run_common_tests()
+
+    def test_model(self):
+        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                           lm_head_model_class=OpenAIGPTLMHeadModel,
+                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester.run_common_tests(test_presents=False)
+
+    @pytest.mark.slow
+    def test_pretrained(self):
+        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                           lm_head_model_class=OpenAIGPTLMHeadModel,
+                                           double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester.run_slow_tests()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_pretrained_bert/tests/modeling_test.py b/pytorch_pretrained_bert/tests/modeling_test.py
new file mode 100644
index 0000000000..2219ee7589
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/modeling_test.py
@@ -0,0 +1,307 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
+                                     BertForNextSentencePrediction, BertForPreTraining,
+                                     BertForQuestionAnswering, BertForSequenceClassification,
+                                     BertForTokenClassification, BertForMultipleChoice)
+from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
+
+from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+
+
+class BertModelTest(unittest.TestCase):
+    class BertModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
+                             BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
+                             BertForTokenClassification),
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+            self.all_model_classes = all_model_classes
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = BertConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = BertModel(config=config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
+
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
+
+        def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = BertForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+        def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = BertForNextSentencePrediction(config=config)
+            model.eval()
+            loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            result = {
+                "loss": loss,
+                "seq_relationship_score": seq_relationship_score,
+            }
+            self.parent.assertListEqual(
+                list(result["seq_relationship_score"].size()),
+                [self.batch_size, 2])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = BertForPreTraining(config=config)
+            model.eval()
+            loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+                "seq_relationship_score": seq_relationship_score,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.parent.assertListEqual(
+                list(result["seq_relationship_score"].size()),
+                [self.batch_size, 2])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = BertForQuestionAnswering(config=config)
+            model.eval()
+            loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
+            result = {
+                "loss": loss,
+                "start_logits": start_logits,
+                "end_logits": end_logits,
+            }
+            self.parent.assertListEqual(
+                list(result["start_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = BertForSequenceClassification(config)
+            model.eval()
+            loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.num_labels])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = BertForTokenClassification(config=config)
+            model.eval()
+            loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.num_labels])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_choices = self.num_choices
+            model = BertForMultipleChoice(config=config)
+            model.eval()
+            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            loss, logits = model(multiple_choice_inputs_ids,
+                         multiple_choice_token_type_ids,
+                         multiple_choice_input_mask,
+                         choice_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.num_choices])
+            self.check_loss_output(result)
+
+
+        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            create_and_check_commons(self, config, inputs_dict)
+
+    def test_default(self):
+        self.run_tester(BertModelTest.BertModelTester(self))
+
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
+        config_tester.run_common_tests()
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+    def run_tester(self, tester):
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_model(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_pretraining(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_question_answering(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_for_token_classification(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_bert_commons(*config_and_inputs)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/modeling_transfo_xl_test.py b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
similarity index 61%
rename from tests/modeling_transfo_xl_test.py
rename to pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
index 88a6ad35fe..e6acbb627d 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
@@ -28,6 +28,8 @@ import torch
 from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
+from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
 
@@ -41,54 +43,58 @@ class TransfoXLModelTest(unittest.TestCase):
                      use_labels=True,
                      vocab_size=99,
                      cutoffs=[10, 50, 80],
-                     d_model=32,
+                     hidden_size=32,
                      d_embed=32,
-                     n_head=4,
+                     num_attention_heads=4,
                      d_head=8,
                      d_inner=128,
                      div_val=2,
-                     n_layer=5,
+                     num_hidden_layers=5,
                      scope=None,
-                     seed=1):
+                     seed=1,
+                     all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
+                     ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
+            self.key_len = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.cutoffs = cutoffs
-            self.d_model = d_model
+            self.hidden_size = hidden_size
             self.d_embed = d_embed
-            self.n_head = n_head
+            self.num_attention_heads = num_attention_heads
             self.d_head = d_head
             self.d_inner = d_inner
             self.div_val = div_val
-            self.n_layer = n_layer
+            self.num_hidden_layers = num_hidden_layers
             self.scope = scope
             self.seed = seed
+            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
-            input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             lm_labels = None
             if self.use_labels:
-                lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
-                d_model=self.d_model,
+                d_model=self.hidden_size,
                 d_embed=self.d_embed,
-                n_head=self.n_head,
+                n_head=self.num_attention_heads,
                 d_head=self.d_head,
                 d_inner=self.d_inner,
                 div_val=self.div_val,
-                n_layer=self.n_layer)
+                n_layer=self.num_hidden_layers)
 
             return (config, input_ids_1, input_ids_2, lm_labels)
 
@@ -113,37 +119,34 @@ class TransfoXLModelTest(unittest.TestCase):
         def check_transfo_xl_model_output(self, result):
             self.parent.assertListEqual(
                 list(result["hidden_states_1"].size()),
-                [self.batch_size, self.seq_length, self.d_model])
+                [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertListEqual(
                 list(result["hidden_states_2"].size()),
-                [self.batch_size, self.seq_length, self.d_model])
+                [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
 
         def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLLMHeadModel(config)
             model.eval()
 
-            loss_1, mems_1a = model(input_ids_1, labels=lm_labels)
-            lm_logits_1, mems_1b = model(input_ids_1)
-
-            loss_2, mems_2a = model(input_ids_2, labels=lm_labels, mems=mems_1a)
-            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
+            lm_logits_1, mems_1 = model(input_ids_1)
+            loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels)
+            lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1)
+            loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1)
 
             outputs = {
                 "loss_1": loss_1,
-                "mems_1a": mems_1a,
+                "mems_1": mems_1,
                 "lm_logits_1": lm_logits_1,
-                "mems_1b": mems_1b,
                 "loss_2": loss_2,
-                "mems_2a": mems_2a,
+                "mems_2": mems_2,
                 "lm_logits_2": lm_logits_2,
-                "mems_2b": mems_2b,
             }
             return outputs
 
@@ -155,14 +158,8 @@ class TransfoXLModelTest(unittest.TestCase):
                 list(result["lm_logits_1"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1a"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1b"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
+                list(list(mem.size()) for mem in result["mems_1"]),
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
@@ -171,31 +168,19 @@ class TransfoXLModelTest(unittest.TestCase):
                 list(result["lm_logits_2"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2a"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2b"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
-                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))
+                list(list(mem.size()) for mem in result["mems_2"]),
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+
+        def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
+            inputs_dict = {'input_ids': input_ids_1}
+            create_and_check_commons(self, config, inputs_dict)
 
     def test_default(self):
         self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
 
-    def test_config_to_json_string(self):
-        config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["n_token"], 96)
-        self.assertEqual(obj["d_embed"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = TransfoXLConfig.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
+        config_tester.run_common_tests()
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
@@ -209,28 +194,18 @@ class TransfoXLModelTest(unittest.TestCase):
         config_and_inputs = tester.prepare_config_and_inputs()
 
         tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_transfo_xl_model(*config_and_inputs)
         tester.check_transfo_xl_model_output(output_result)
 
         tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
         tester.check_transfo_xl_lm_head_output(output_result)
 
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+        tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_transfo_xl_commons(*config_and_inputs)
 
 
 if __name__ == "__main__":
diff --git a/tests/modeling_xlnet_test.py b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
similarity index 77%
rename from tests/modeling_xlnet_test.py
rename to pytorch_pretrained_bert/tests/modeling_xlnet_test.py
index e696c618b1..cf55889a96 100644
--- a/tests/modeling_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
@@ -25,9 +25,11 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel)
+from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
+from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+
 class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):
 
@@ -42,43 +44,48 @@ class XLNetModelTest(unittest.TestCase):
                      use_labels=True,
                      vocab_size=99,
                      cutoffs=[10, 50, 80],
-                     d_model=32,
-                     n_head=4,
+                     hidden_size=32,
+                     num_attention_heads=4,
                      d_inner=128,
-                     n_layer=5,
+                     num_hidden_layers=5,
                      max_position_embeddings=10,
                      untie_r=True,
                      bi_data=False,
                      same_length=False,
                      seed=1,
-                     type_vocab_size=2):
+                     type_vocab_size=2,
+                     all_model_classes=(XLNetModel, XLNetLMHeadModel,
+                                        XLNetForSequenceClassification, XLNetForQuestionAnswering),
+            ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
+            # self.key_len = seq_length + mem_len
             self.clamp_len = clamp_len
             self.reuse_len = reuse_len
             self.is_training = is_training
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.cutoffs = cutoffs
-            self.d_model = d_model
-            self.n_head = n_head
+            self.hidden_size = hidden_size
+            self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
-            self.n_layer = n_layer
+            self.num_hidden_layers = num_hidden_layers
             self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
             self.seed = seed
             self.type_vocab_size = type_vocab_size
+            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
-            input_ids_1 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            segment_ids = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
-            input_ids_q = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
+            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
             perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
             perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
             target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
@@ -89,8 +96,8 @@ class XLNetModelTest(unittest.TestCase):
             # token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
             # input_mask: float32 Tensor in shape [bsz, len], the input mask.
             #     0 for real tokens and 1 for padding.
-            # mems: a list of float32 Tensors in shape [bsz, mem_len, d_model], memory
-            #     from previous batches. The length of the list equals n_layer.
+            # mems: a list of float32 Tensors in shape [bsz, mem_len, hidden_size], memory
+            #     from previous batches. The length of the list equals num_hidden_layers.
             #     If None, no memory is used.
             # perm_mask: float32 Tensor in shape [bsz, len, len].
             #     If perm_mask[k, i, j] = 0, i attend to j in batch k;
@@ -108,14 +115,14 @@ class XLNetModelTest(unittest.TestCase):
 
             lm_labels = None
             if self.use_labels:
-                lm_labels = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = XLNetConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
-                d_model=self.d_model,
-                n_head=self.n_head,
+                d_model=self.hidden_size,
+                n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
-                n_layer=self.n_layer,
+                n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
                 max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
@@ -159,7 +166,7 @@ class XLNetModelTest(unittest.TestCase):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
@@ -169,24 +176,18 @@ class XLNetModelTest(unittest.TestCase):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+
+        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+            inputs_dict = {'input_ids': input_ids_1}
+            create_and_check_commons(self, config, inputs_dict)
 
     def test_default(self):
         self.run_tester(XLNetModelTest.XLNetModelTester(self))
 
-    def test_config_to_json_string(self):
-        config = XLNetConfig(vocab_size_or_config_json_file=96, d_model=16*4)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["n_token"], 96)
-        self.assertEqual(obj["d_model"], 16*4)
-
-    def test_config_to_json_file(self):
-        config_first = XLNetConfig(vocab_size_or_config_json_file=96, d_model=16*4)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = XLNetConfig.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
+        config_tester.run_common_tests()
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
@@ -197,27 +198,14 @@ class XLNetModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
 
     def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-
         tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
         output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
         tester.check_transfo_xl_lm_head_output(output_result)
 
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+        tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlnet_commons(*config_and_inputs)
 
     @classmethod
     def mask_tensor(cls, shape, vocab_size, rng=None, name=None):
diff --git a/tests/optimization_test.py b/pytorch_pretrained_bert/tests/optimization_test.py
similarity index 100%
rename from tests/optimization_test.py
rename to pytorch_pretrained_bert/tests/optimization_test.py
diff --git a/tests/tokenization_gpt2_test.py b/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
similarity index 100%
rename from tests/tokenization_gpt2_test.py
rename to pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
diff --git a/tests/tokenization_openai_test.py b/pytorch_pretrained_bert/tests/tokenization_openai_test.py
similarity index 100%
rename from tests/tokenization_openai_test.py
rename to pytorch_pretrained_bert/tests/tokenization_openai_test.py
diff --git a/tests/tokenization_test.py b/pytorch_pretrained_bert/tests/tokenization_test.py
similarity index 100%
rename from tests/tokenization_test.py
rename to pytorch_pretrained_bert/tests/tokenization_test.py
diff --git a/tests/tokenization_transfo_xl_test.py b/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
similarity index 100%
rename from tests/tokenization_transfo_xl_test.py
rename to pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
diff --git a/tests/tokenization_xlnet_test.py b/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
similarity index 97%
rename from tests/tokenization_xlnet_test.py
rename to pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
index 285dee226d..707a516b96 100644
--- a/tests/tokenization_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
@@ -30,9 +30,8 @@ from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(
-                    os.path.dirname(os.path.abspath(__file__))),
-                    'samples/test_sentencepiece.model')
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                    'fixtures/test_sentencepiece.model')
 
 class XLNetTokenizationTest(unittest.TestCase):
 
diff --git a/tests/modeling_gpt2_test.py b/tests/modeling_gpt2_test.py
deleted file mode 100644
index 589de22f5d..0000000000
--- a/tests/modeling_gpt2_test.py
+++ /dev/null
@@ -1,364 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
-                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
-from pytorch_pretrained_bert.modeling_gpt2 import PRETRAINED_MODEL_ARCHIVE_MAP
-
-class GPT2ModelTest(unittest.TestCase):
-    class GPT2ModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_position_ids=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_special=1,
-                     n_positions=33,
-                     n_embd=32,
-                     n_layer=5,
-                     n_head=4,
-                     n_choices=3,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     scope=None):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_position_ids = use_position_ids
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.n_choices = n_choices
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            total_num_tokens = self.vocab_size + self.n_special
-            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
-
-            position_ids = None
-            if self.use_position_ids:
-                position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                total_voc = self.vocab_size
-                token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-            mc_labels = None
-            lm_labels = None
-            mc_token_ids = None
-            if self.use_labels:
-                mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
-                lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
-                n_special=self.n_special,
-                n_positions=self.n_positions,
-                n_embd=self.n_embd,
-                n_layer=self.n_layer,
-                n_head=self.n_head,
-                initializer_range=self.initializer_range)
-
-            return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_ids)
-
-        def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_ids):
-            model = GPT2Model(config)
-            model.eval()
-            hidden_states, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "hidden_states": hidden_states,
-                "presents": presents,
-            }
-            return outputs
-
-        def check_gpt2_model_output(self, result):
-            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
-            self.parent.assertListEqual(
-                list(result["hidden_states"][0].size()),
-                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
-
-
-        def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2LMHeadModel(config)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            lm_logits, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "presents": presents,
-            }
-            return outputs
-
-        def create_gpt2_lm_head_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2LMHeadModel(config, output_attentions=True)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            attentions, lm_logits, presents = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "presents": presents,
-                "attentions": attentions,
-            }
-            return outputs
-
-        def check_gpt2_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertEqual(self.n_layer, len(result["presents"]))
-            self.parent.assertListEqual(
-                list(result["presents"][0].size()),
-                [2, self.batch_size * self.n_choices, self.n_head, self.seq_length, self.n_embd // self.n_head])
-
-        def check_gpt2_lm_head_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-
-        def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2DoubleHeadsModel(config)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-                "presents": presents,
-            }
-            return outputs
-
-        def create_gpt2_double_heads_with_output_attention(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = GPT2DoubleHeadsModel(config, output_attentions=True)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            attentions, lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-                "presents": presents,
-                "attentions": attentions,
-            }
-            return outputs
-
-        def check_gpt2_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].size()),
-                [self.batch_size, self.n_choices])
-
-        def check_gpt2_double_heads_loss_output(self, result):
-            self.parent.assertListEqual(
-                [list(l.size()) for l in result["loss"]],
-                [[], []])
-
-        def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
-                                                mc_labels, lm_labels, mc_token_ids):
-            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)
-                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
-                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                if isinstance(model, GPT2DoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
-                else:
-                    output = model(input_ids, head_mask=head_mask)
-
-                if isinstance(model, GPT2Model):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output[:-1])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[1].nonzero()),
-                    multihead_outputs[1].numel())
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-        def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
-                                                   mc_labels, lm_labels, mc_token_ids):
-            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                transformer = model if isinstance(model, GPT2Model) else model.transformer
-                heads_to_prune = {0: list(range(1, self.n_head)),
-                                  -1: [0]}
-                transformer.prune_heads(heads_to_prune)
-                if isinstance(model, GPT2DoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids)
-                else:
-                    output = model(input_ids)
-
-                if isinstance(model, GPT2Model):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output[:-1])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = transformer.get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, 1,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head-1,
-                        self.seq_length, self.n_embd // self.n_head])
-
-
-    def test_default(self):
-        self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
-
-    def test_config_to_json_string(self):
-        config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["vocab_size"], 99)
-        self.assertEqual(obj["n_embd"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = GPT2Config.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_gpt2_model(*config_and_inputs)
-        tester.check_gpt2_model_output(output_result)
-
-        output_result = tester.create_gpt2_lm_head(*config_and_inputs)
-        tester.check_gpt2_lm_head_output(output_result)
-        tester.check_gpt2_lm_head_loss_output(output_result)
-
-        output_result = tester.create_gpt2_double_heads(*config_and_inputs)
-        tester.check_gpt2_double_heads_output(output_result)
-        tester.check_gpt2_double_heads_loss_output(output_result)
-
-        tester.create_and_check_gpt2_for_headmasking(*config_and_inputs)
-        tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs)
-
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
deleted file mode 100644
index c8fc8f48fc..0000000000
--- a/tests/modeling_openai_test.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
-                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from pytorch_pretrained_bert.modeling_openai import PRETRAINED_MODEL_ARCHIVE_MAP
-
-class OpenAIGPTModelTest(unittest.TestCase):
-    class OpenAIGPTModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_position_ids=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     n_special=1,
-                     n_positions=33,
-                     n_embd=32,
-                     n_layer=5,
-                     n_head=4,
-                     n_choices=3,
-                     afn="gelu",
-                     resid_pdrop=0.1,
-                     attn_pdrop=0.1,
-                     embd_pdrop=0.1,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     scope=None):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_position_ids = use_position_ids
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.n_choices = n_choices
-            self.resid_pdrop = resid_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
-
-            position_ids = None
-            if self.use_position_ids:
-                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                total_voc = self.vocab_size + self.n_special
-                token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-            mc_labels = None
-            lm_labels = None
-            mc_token_ids = None
-            if self.use_labels:
-                mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
-                lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
-                n_positions=self.n_positions,
-                n_special=self.n_special,
-                n_embd=self.n_embd,
-                n_layer=self.n_layer,
-                n_head=self.n_head,
-                afn=self.afn,
-                resid_pdrop=self.resid_pdrop,
-                attn_pdrop=self.attn_pdrop,
-                embd_pdrop=self.embd_pdrop,
-                initializer_range=self.initializer_range)
-
-            return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_ids)
-
-        def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTModel(config)
-            model.eval()
-            hidden_states = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "hidden_states": hidden_states,
-            }
-            return outputs
-
-        def check_openai_model_output(self, result):
-            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
-            self.parent.assertListEqual(
-                list(result["hidden_states"][0].size()),
-                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
-
-
-        def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTLMHeadModel(config)
-            model.eval()
-            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
-            lm_logits = model(input_ids, position_ids, token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-            }
-            return outputs
-
-        def check_openai_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-
-        def check_openai_lm_head_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-
-        def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_ids):
-            model = OpenAIGPTDoubleHeadsModel(config)
-            model.eval()
-            loss = model(input_ids, mc_token_ids,
-                         lm_labels=lm_labels, mc_labels=mc_labels,
-                         token_type_ids=token_type_ids, position_ids=position_ids)
-            lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-            outputs = {
-                "loss": loss,
-                "lm_logits": lm_logits,
-                "mc_logits": mc_logits,
-            }
-            return outputs
-
-        def check_openai_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()),
-                [self.batch_size, self.n_choices, self.seq_length, total_voc])
-            self.parent.assertListEqual(
-                list(result["mc_logits"].size()),
-                [self.batch_size, self.n_choices])
-
-        def check_openai_double_heads_loss_output(self, result):
-            self.parent.assertListEqual(
-                [list(l.size()) for l in result["loss"]],
-                [[], []])
-
-        def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
-                                                mc_labels, lm_labels, mc_token_ids):
-            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                head_mask = torch.ones(self.n_layer, self.n_head).to(input_ids.device)
-                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
-                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                if isinstance(model, OpenAIGPTDoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
-                else:
-                    output = model(input_ids, head_mask=head_mask)
-
-                if isinstance(model, OpenAIGPTModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
-                output = output.sum()
-                output.backward()
-                multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-                self.parent.assertEqual(
-                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[1].nonzero()),
-                    multihead_outputs[1].numel())
-
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                     self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                    0)
-                self.parent.assertEqual(
-                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
-
-
-        def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
-                                                     mc_labels, lm_labels, mc_token_ids):
-            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
-                model = model_class(config=config, keep_multihead_output=True)
-                model.eval()
-                transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer
-                heads_to_prune = {0: list(range(1, self.n_head)),
-                                  -1: [0]}
-                transformer.prune_heads(heads_to_prune)
-                if isinstance(model, OpenAIGPTDoubleHeadsModel):
-                    output = model(input_ids, mc_token_ids)
-                else:
-                    output = model(input_ids)
-
-                if isinstance(model, OpenAIGPTModel):
-                    output = sum(t.sum() for t in output[0])
-                elif isinstance(output, (list, tuple)):
-                    output = sum(t.sum() for t in output)
-                output = output.sum()
-                output.backward()
-                multihead_outputs = transformer.get_multihead_outputs()
-
-                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
-                self.parent.assertListEqual(
-                    list(multihead_outputs[0].size()),
-                    [self.batch_size * self.n_choices, 1,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[1].size()),
-                    [self.batch_size * self.n_choices, self.n_head,
-                        self.seq_length, self.n_embd // self.n_head])
-                self.parent.assertListEqual(
-                    list(multihead_outputs[-1].size()),
-                    [self.batch_size * self.n_choices, self.n_head-1,
-                        self.seq_length, self.n_embd // self.n_head])
-
-
-    def test_default(self):
-        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
-
-    def test_config_to_json_string(self):
-        config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["vocab_size"], 99)
-        self.assertEqual(obj["n_embd"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = OpenAIGPTConfig.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_openai_model(*config_and_inputs)
-        tester.check_openai_model_output(output_result)
-
-        output_result = tester.create_openai_lm_head(*config_and_inputs)
-        tester.check_openai_lm_head_output(output_result)
-        tester.check_openai_lm_head_loss_output(output_result)
-
-        output_result = tester.create_openai_double_heads(*config_and_inputs)
-        tester.check_openai_double_heads_output(output_result)
-        tester.check_openai_double_heads_loss_output(output_result)
-
-        tester.create_and_check_openai_for_headmasking(*config_and_inputs)
-        tester.create_and_check_openai_for_head_pruning(*config_and_inputs)
-
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/modeling_test.py b/tests/modeling_test.py
deleted file mode 100644
index 10e93658c9..0000000000
--- a/tests/modeling_test.py
+++ /dev/null
@@ -1,467 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
-                                     BertForNextSentencePrediction, BertForPreTraining,
-                                     BertForQuestionAnswering, BertForSequenceClassification,
-                                     BertForTokenClassification, BertForMultipleChoice)
-from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-class BertModelTest(unittest.TestCase):
-    class BertModelTester(object):
-
-        def __init__(self,
-                     parent,
-                     batch_size=13,
-                     seq_length=7,
-                     is_training=True,
-                     use_input_mask=True,
-                     use_token_type_ids=True,
-                     use_labels=True,
-                     vocab_size=99,
-                     hidden_size=32,
-                     num_hidden_layers=5,
-                     num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
-                     scope=None):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = BertModelTest.ids_tensor([self.batch_size], self.num_choices)
-
-            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(
-                list(result["loss"].size()),
-                [])
-
-        def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertModel(config=config)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-
-            model = BertModel(config=config, output_hidden_states=True)
-            model.eval()
-            _, _, all_encoder_layers = model(input_ids, token_type_ids, input_mask)
-            outputs = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-                "all_encoder_layers": all_encoder_layers,
-            }
-            return outputs
-
-        def check_bert_model_output(self, result):
-            self.parent.assertListEqual(
-                [size for layer in result["all_encoder_layers"] for size in layer.size()],
-                [self.batch_size, self.seq_length, self.hidden_size] * (self.num_hidden_layers + 1))
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-
-        def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForMaskedLM(config=config)
-            model.eval()
-            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
-            outputs = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            return outputs
-
-        def check_bert_for_masked_lm_output(self, result):
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
-
-        def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForNextSentencePrediction(config=config)
-            model.eval()
-            loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
-            outputs = {
-                "loss": loss,
-                "seq_relationship_score": seq_relationship_score,
-            }
-            return outputs
-
-        def check_bert_for_next_sequence_prediction_output(self, result):
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].size()),
-                [self.batch_size, 2])
-
-
-        def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForPreTraining(config=config)
-            model.eval()
-            loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
-            outputs = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-                "seq_relationship_score": seq_relationship_score,
-            }
-            return outputs
-
-        def check_bert_for_pretraining_output(self, result):
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
-            self.parent.assertListEqual(
-                list(result["seq_relationship_score"].size()),
-                [self.batch_size, 2])
-
-
-        def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForQuestionAnswering(config=config)
-            model.eval()
-            loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
-            outputs = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            return outputs
-
-        def check_bert_for_question_answering_output(self, result):
-            self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
-
-
-        def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
-            model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
-            outputs = {
-                "loss": loss,
-                "logits": logits,
-            }
-            return outputs
-
-        def check_bert_for_sequence_classification_output(self, result):
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_labels])
-
-
-        def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForTokenClassification(config=config, num_labels=self.num_labels)
-            model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
-            outputs = {
-                "loss": loss,
-                "logits": logits,
-            }
-            return outputs
-
-        def check_bert_for_token_classification_output(self, result):
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.seq_length, self.num_labels])
-
-
-        def create_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            model = BertForMultipleChoice(config=config, num_choices=self.num_choices)
-            model.eval()
-            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss, logits = model(multiple_choice_inputs_ids,
-                         multiple_choice_token_type_ids,
-                         multiple_choice_input_mask,
-                         choice_labels)
-            outputs = {
-                "loss": loss,
-                "logits": logits,
-            }
-            return outputs
-
-        def check_bert_for_multiple_choice(self, result):
-            self.parent.assertListEqual(
-                list(result["logits"].size()),
-                [self.batch_size, self.num_choices])
-
-
-        def create_and_check_bert_for_attentions(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                                BertForTokenClassification):
-                if model_class in [BertForSequenceClassification,
-                                   BertForTokenClassification]:
-                    model = model_class(config=config, num_labels=self.num_labels, output_attentions=True)
-                else:
-                    model = model_class(config=config, output_attentions=True)
-                model.eval()
-                outputs = model(input_ids, token_type_ids, input_mask)
-                attentions = outputs[-1]
-                self.parent.assertEqual(len(attentions), self.num_hidden_layers)
-                self.parent.assertListEqual(
-                    list(attentions[0].size()),
-                    [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length])
-
-
-        def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                                BertForTokenClassification):
-                if model_class in [BertForSequenceClassification,
-                                   BertForTokenClassification]:
-                    model = model_class(config=config,
-                                        num_labels=self.num_labels)
-                else:
-                    model = model_class(config=config)
-                model.eval()
-                head_mask = torch.ones(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device)
-                head_mask[0, 1:-1] = 0.0 # Mask all but the first and last heads on the first layer
-                head_mask[-1, 1:] = 0.0  # Mask all but the first head on the last layer
-                # Set that after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
-                head_mask.requires_grad_(requires_grad=True)
-                outputs = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)
-
-                # Compute some gradients
-                output = sum(t.sum() for t in outputs[0])
-                output = output.sum()
-                output.backward()
-                multihead_outputs = head_mask.grad
-
-                self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[0].size()),
-                #     [self.batch_size, self.num_attention_heads,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
-                #     0)
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[0][:, 0, :, :].nonzero()),
-                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
-                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[1].size()),
-                #     [self.batch_size, self.num_attention_heads,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[1].nonzero()),
-                #     multihead_outputs[1].numel())
-
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[-1].size()),
-                #     [self.batch_size, self.num_attention_heads,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-                #     0)
-                # self.parent.assertEqual(
-                #     len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-                #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-
-
-        def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                                BertForTokenClassification):
-                if model_class in [BertForSequenceClassification,
-                                   BertForTokenClassification]:
-                    model = model_class(config=config,
-                                        num_labels=self.num_labels)
-                else:
-                    model = model_class(config=config)
-                model.eval()
-                bert_model = model if isinstance(model, BertModel) else model.bert
-                heads_to_prune = {0: list(range(1, self.num_attention_heads)),
-                                  -1: [0]}
-                bert_model.prune_heads(heads_to_prune)
-                outputs = model(input_ids, token_type_ids, input_mask)
-
-                # output = sum(t.sum() for t in outputs[0])
-                # output = output.sum()
-                # output.backward()
-                # multihead_outputs = bert_model.get_multihead_outputs()
-
-                # self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[0].size()),
-                #     [self.batch_size, 1,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[1].size()),
-                #     [self.batch_size, self.num_attention_heads,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-                # self.parent.assertListEqual(
-                #     list(multihead_outputs[-1].size()),
-                #     [self.batch_size, self.num_attention_heads-1,
-                #      self.seq_length, self.hidden_size // self.num_attention_heads])
-
-
-    def test_default(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
-
-    def test_config_to_json_string(self):
-        config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)
-        obj = json.loads(config.to_json_string())
-        self.assertEqual(obj["vocab_size"], 99)
-        self.assertEqual(obj["hidden_size"], 37)
-
-    def test_config_to_json_file(self):
-        config_first = BertConfig(vocab_size_or_config_json_file=99, hidden_size=37)
-        json_file_path = "/tmp/config.json"
-        config_first.to_json_file(json_file_path)
-        config_second = BertConfig.from_json_file(json_file_path)
-        os.remove(json_file_path)
-        self.assertEqual(config_second.to_dict(), config_first.to_dict())
-
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_bert_model(*config_and_inputs)
-        tester.check_bert_model_output(output_result)
-
-        output_result = tester.create_bert_for_masked_lm(*config_and_inputs)
-        tester.check_bert_for_masked_lm_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs)
-        tester.check_bert_for_next_sequence_prediction_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_pretraining(*config_and_inputs)
-        tester.check_bert_for_pretraining_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_question_answering(*config_and_inputs)
-        tester.check_bert_for_question_answering_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_sequence_classification(*config_and_inputs)
-        tester.check_bert_for_sequence_classification_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_token_classification(*config_and_inputs)
-        tester.check_bert_for_token_classification_output(output_result)
-        tester.check_loss_output(output_result)
-
-        output_result = tester.create_bert_for_multiple_choice(*config_and_inputs)
-        tester.check_bert_for_multiple_choice(output_result)
-        tester.check_loss_output(output_result)
-
-        tester.create_and_check_bert_for_attentions(*config_and_inputs)
-        tester.create_and_check_bert_for_headmasking(*config_and_inputs)
-        tester.create_and_check_bert_for_head_pruning(*config_and_inputs)
-
-    @classmethod
-    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a random int32 tensor of the shape within the vocab size."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/xlnet b/xlnet
deleted file mode 160000
index cbdedecbc7..0000000000
--- a/xlnet
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit cbdedecbc7951fc000a1547f9feb086c34f0698b

From 99ae5ab8831f8ceaa39822f6ca5632daf44be7e6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 2 Jul 2019 12:40:39 +0200
Subject: [PATCH 035/139] update config tests and circle-ci

---
 .circleci/config.yml                                |  4 ++--
 pytorch_pretrained_bert/modeling_gpt2.py            | 13 +++++++++++++
 pytorch_pretrained_bert/modeling_openai.py          | 12 ++++++++++++
 pytorch_pretrained_bert/modeling_transfo_xl.py      | 11 +++++++++++
 pytorch_pretrained_bert/modeling_xlnet.py           | 12 ++++++++++++
 .../tests/model_tests_commons.py                    |  7 +++++++
 6 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 3a4bae2984..89819c1009 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,7 +10,7 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/ --cov
+            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
             - run: codecov
     build_py2:
         working_directory: ~/pytorch-pretrained-BERT
@@ -22,7 +22,7 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/ --cov
+            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
             - run: codecov
 workflows:
   version: 2
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 688512ae80..85ec85c16c 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -175,6 +175,19 @@ class GPT2Config(PretrainedConfig):
     def total_tokens_embeddings(self):
         return self.vocab_size + self.n_special
 
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
+
+
 
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 5ee4e9224a..f394723d10 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -206,6 +206,18 @@ class OpenAIGPTConfig(PretrainedConfig):
     def total_tokens_embeddings(self):
         return self.vocab_size + self.n_special
 
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
+
 
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 84df603a53..1b129544a3 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -289,6 +289,17 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
 
 
 class PositionalEmbedding(nn.Module):
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 754a03f37d..33fd78c61e 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -313,6 +313,18 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
+
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index 759b31aa0a..6ebc9dbaa6 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -184,6 +184,12 @@ class ConfigTester(object):
         self.config_class = config_class
         self.inputs_dict = kwargs
 
+    def create_and_test_config_common_properties(self):
+        config = self.config_class(**self.inputs_dict)
+        self.parent.assertTrue(hasattr(config, 'hidden_size'))
+        self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
+        self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
+
     def create_and_test_config_to_json_string(self):
         config = self.config_class(**self.inputs_dict)
         obj = json.loads(config.to_json_string())
@@ -199,6 +205,7 @@ class ConfigTester(object):
         self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
 
     def run_common_tests(self):
+        self.create_and_test_config_common_properties()
         self.create_and_test_config_to_json_string()
         self.create_and_test_config_to_json_file()
 

From 708877958a308a0f0e8fd199f8f327e4797f1583 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 2 Jul 2019 16:35:29 +0200
Subject: [PATCH 036/139] updating tests and models, adding weights
 initialization test

---
 pytorch_pretrained_bert/file_utils.py         |   2 +
 pytorch_pretrained_bert/model_utils.py        |   3 +-
 pytorch_pretrained_bert/modeling.py           |   8 +-
 pytorch_pretrained_bert/modeling_gpt2.py      |  13 +-
 pytorch_pretrained_bert/modeling_openai.py    |  13 +-
 .../modeling_transfo_xl.py                    |   6 +-
 pytorch_pretrained_bert/modeling_xlm.py       |   1 -
 pytorch_pretrained_bert/modeling_xlnet.py     |  12 +-
 .../tests/model_tests_commons.py              | 136 +++++++++---------
 .../tests/modeling_gpt2_test.py               |  12 +-
 .../tests/modeling_openai_test.py             |   4 +-
 .../tests/modeling_transfo_xl_test.py         |   2 +-
 .../tests/modeling_xlnet_test.py              |   7 +-
 13 files changed, 112 insertions(+), 107 deletions(-)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index ed509e5033..994f47d57c 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -191,6 +191,8 @@ def get_from_cache(url, cache_dir=None):
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
+    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
+        cache_dir = str(cache_dir)
 
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index 15f6a4d5b4..8c116df54a 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -60,8 +60,7 @@ class PretrainedConfig(object):
                     . `config.json` a configuration file for the model
             cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
         """
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
+        cache_dir = kwargs.pop('cache_dir', None)
 
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index f2b63634b1..999ba9d79f 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -17,7 +17,6 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import copy
 import json
 import logging
 import math
@@ -422,8 +421,7 @@ class BertEncoder(nn.Module):
         super(BertEncoder, self).__init__()
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        layer = BertLayer(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask, head_mask=None):
         all_hidden_states = []
@@ -539,10 +537,12 @@ class BertPreTrainedModel(PreTrainedModel):
     """
     config_class = BertConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_bert
     base_model_prefix = "bert"
 
+    def __init__(self, *inputs, **kwargs):
+        super(BertPreTrainedModel, self).__init__(*inputs, **kwargs)
+
     def init_weights(self, module):
         """ Initialize the weights.
         """
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 85ec85c16c..fef4937400 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -18,7 +18,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import collections
-import copy
 import json
 import logging
 import math
@@ -378,18 +377,21 @@ class GPT2PreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_gpt2
     base_model_prefix = "transformer"
 
+    def __init__(self, *inputs, **kwargs):
+        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+
     def init_weights(self, module):
         """ Initialize the weights.
         """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
+        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
+                module.bias.data.zero_()
         elif isinstance(module, LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
@@ -489,8 +491,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
+        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
         self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
 
         self.apply(self.init_weights)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f394723d10..f4fe09110a 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -18,7 +18,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import collections
-import copy
 import json
 import logging
 import math
@@ -405,18 +404,21 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_openai_gpt
     base_model_prefix = "transformer"
 
+    def __init__(self, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
+
     def init_weights(self, module):
         """ Initialize the weights.
         """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
+        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
+                module.bias.data.zero_()
         elif isinstance(module, LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
@@ -513,8 +515,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
+        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
 
         self.apply(self.init_weights)
 
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 1b129544a3..871f699b1a 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -21,7 +21,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
-import copy
 import json
 import math
 import logging
@@ -843,6 +842,9 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_transfo_xl
     base_model_prefix = "transformer"
 
+    def __init__(self, *inputs, **kwargs):
+        super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
+
     def _init_weight(self, weight):
         if self.config.init == 'uniform':
             nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
@@ -883,7 +885,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
                 nn.init.normal_(m.weight, 1.0, self.config.init_std)
             if hasattr(m, 'bias') and m.bias is not None:
                 self._init_bias(m.bias)
-        elif classname.find('TransformerLM') != -1:
+        else:
             if hasattr(m, 'r_emb'):
                 self._init_weight(m.r_emb)
             if hasattr(m, 'r_w_bias'):
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index b86c9778a2..325f887923 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -18,7 +18,6 @@ from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import copy
 import json
 import logging
 import math
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 33fd78c61e..da5ccdb8f3 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -19,7 +19,6 @@ from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import copy
 import json
 import logging
 import math
@@ -598,6 +597,8 @@ class XLNetPreTrainedModel(PreTrainedModel):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
         elif isinstance(module, XLNetLayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
@@ -606,8 +607,8 @@ class XLNetPreTrainedModel(PreTrainedModel):
                           module.r_r_bias, module.r_s_bias, module.r_w_bias,
                           module.seg_embed]:
                 param.data.normal_(mean=0.0, std=self.config.initializer_range)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
+        elif isinstance(module, XLNetModel):
+                module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
 
 
 class XLNetModel(XLNetPreTrainedModel):
@@ -627,10 +628,11 @@ class XLNetModel(XLNetPreTrainedModel):
 
         self.word_embedding = nn.Embedding(config.n_token, config.d_model)
         self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
-        layer = XLNetLayer(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layer)])
+        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
 
+        self.apply(self.init_weights)
+
     def _prune_heads(self, heads_to_prune):
         logger.info("Head pruning is not implemented for XLNet")
         pass
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index 6ebc9dbaa6..da5d0f8b8a 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -16,6 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import os
 import shutil
 import json
@@ -23,87 +24,84 @@ import random
 
 import torch
 
-def create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
-    for model_class in model_classes:
-        config.output_hidden_states = True
-        model = model_class(config=config)
-        model.eval()
-        head_mask = torch.zeros(tester.num_hidden_layers, tester.num_attention_heads)
-        # Set that after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
-        head_mask.requires_grad_(requires_grad=True)
-        outputs = model(**inputs_dict, head_mask=head_mask)
+def _config_zero_init(config):
+    configs_no_init = copy.deepcopy(config)
+    for key in configs_no_init.__dict__.keys():
+        if '_range' in key or '_std' in key:
+            setattr(configs_no_init, key, 0.0)
+    return configs_no_init
 
-        # Compute some gradients
+def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
+    configs_no_init = _config_zero_init(config)
+    for model_class in model_classes:
+        model = model_class(config=configs_no_init)
+        for name, param in model.named_parameters():
+            tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0], msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+
+def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
+    configs_no_init = _config_zero_init(config)
+    for model_class in model_classes:
+        config.output_attentions = True
+        config.output_hidden_states = True
+        model = model_class(config=configs_no_init)
+        model.eval()
+
+        # Prepare head_mask
+        # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
+        head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
+        head_mask[0, 0] = 0
+        head_mask[-1, :-1] = 0
+        head_mask.requires_grad_(requires_grad=True)
+        inputs = inputs_dict.copy()
+        inputs['head_mask'] = head_mask
+
+        outputs = model(**inputs)
+
+        # Test that we can get a gradient back for importance score computation
         output = sum(t.sum() for t in outputs[0])
         output = output.sum()
         output.backward()
         multihead_outputs = head_mask.grad
 
+        attentions = outputs[-1]
+        hidden_states = outputs[-2]
+
+        tester.parent.assertIsNotNone(multihead_outputs)
         tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[0].size()),
-        #     [self.batch_size, self.num_attention_heads,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
-        #     0)
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[0][:, 0, :, :].nonzero()),
-        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
-        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
-
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[1].size()),
-        #     [self.batch_size, self.num_attention_heads,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[1].nonzero()),
-        #     multihead_outputs[1].numel())
-
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[-1].size()),
-        #     [self.batch_size, self.num_attention_heads,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
-        #     0)
-        # self.parent.assertEqual(
-        #     len(multihead_outputs[-1][:, 0, :, :].nonzero()),
-        #     self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
+        tester.parent.assertAlmostEqual(
+            attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+        tester.parent.assertNotEqual(
+            attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+        tester.parent.assertNotEqual(
+            attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+        tester.parent.assertAlmostEqual(
+            attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+        tester.parent.assertNotEqual(
+            attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
 
-def create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
+def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
     for model_class in model_classes:
+        config.output_attentions = True
+        config.output_hidden_states = False
         model = model_class(config=config)
         model.eval()
         heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
-                            -1: [0]}
+                          -1: [0]}
         model.prune_heads(heads_to_prune)
         outputs = model(**inputs_dict)
 
-        # output = sum(t.sum() for t in outputs[0])
-        # output = output.sum()
-        # output.backward()
-        # multihead_outputs = bert_model.get_multihead_outputs()
+        attentions = outputs[-1]
 
-        # self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[0].size()),
-        #     [self.batch_size, 1,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[1].size()),
-        #     [self.batch_size, self.num_attention_heads,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
-        # self.parent.assertListEqual(
-        #     list(multihead_outputs[-1].size()),
-        #     [self.batch_size, self.num_attention_heads-1,
-        #      self.seq_length, self.hidden_size // self.num_attention_heads])
+        tester.parent.assertEqual(
+            attentions[0].shape[-3], 1)
+        tester.parent.assertEqual(
+            attentions[1].shape[-3], tester.num_attention_heads)
+        tester.parent.assertEqual(
+            attentions[-1].shape[-3], tester.num_attention_heads - 1)
 
 
-def create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
+def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
     for model_class in model_classes:
         config.output_attentions = True
         config.output_hidden_states = False
@@ -139,7 +137,7 @@ def create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
              tester.seq_length,
              tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
 
-def create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
+def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
     for model_class in model_classes:
         config.output_hidden_states = True
         config.output_attentions = False
@@ -155,11 +153,13 @@ def create_and_check_for_hidden_states(tester, model_classes, config, inputs_dic
             [tester.seq_length, tester.hidden_size])
 
 
-def create_and_check_commons(tester, config, inputs_dict):
-    create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
-    create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
-    create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
-    create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
+def create_and_check_commons(tester, config, inputs_dict, test_pruning=True):
+    _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
+    if test_pruning:
+        _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
 
 
 def ids_tensor(shape, vocab_size, rng=None, name=None):
diff --git a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py b/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
index 552599b1fc..122cdf3c7b 100644
--- a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
@@ -28,9 +28,7 @@ import torch
 from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_for_attentions, create_and_check_for_head_pruning,
-                                  create_and_check_for_headmasking, create_and_check_for_hidden_states,
-                                  ConfigTester, GPTModelTester)
+from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class GPT2ModelTest(unittest.TestCase):
 
@@ -40,15 +38,15 @@ class GPT2ModelTest(unittest.TestCase):
 
     def test_model(self):
         model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                           lm_head_model_class=GPT2LMHeadModel,
-                                           double_head_model_class=GPT2DoubleHeadsModel)
+                                            lm_head_model_class=GPT2LMHeadModel,
+                                            double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_common_tests(test_presents=True)
 
     @pytest.mark.slow
     def test_pretrained(self):
         model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                           lm_head_model_class=GPT2LMHeadModel,
-                                           double_head_model_class=GPT2DoubleHeadsModel)
+                                            lm_head_model_class=GPT2LMHeadModel,
+                                            double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_slow_tests()
 
 if __name__ == "__main__":
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_pretrained_bert/tests/modeling_openai_test.py
index 83ef480f49..e3e9e2849d 100644
--- a/pytorch_pretrained_bert/tests/modeling_openai_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_openai_test.py
@@ -28,9 +28,7 @@ import torch
 from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_for_attentions, create_and_check_for_head_pruning,
-                                  create_and_check_for_headmasking, create_and_check_for_hidden_states,
-                                  ConfigTester, GPTModelTester)
+from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class OpenAIModelTest(unittest.TestCase):
 
diff --git a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
index e6acbb627d..8b46b6d755 100644
--- a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
@@ -173,7 +173,7 @@ class TransfoXLModelTest(unittest.TestCase):
 
         def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
             inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict)
+            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
 
     def test_default(self):
         self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
index cf55889a96..1527f08642 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
@@ -52,6 +52,7 @@ class XLNetModelTest(unittest.TestCase):
                      untie_r=True,
                      bi_data=False,
                      same_length=False,
+                     initializer_range=0.05,
                      seed=1,
                      type_vocab_size=2,
                      all_model_classes=(XLNetModel, XLNetLMHeadModel,
@@ -76,6 +77,7 @@ class XLNetModelTest(unittest.TestCase):
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
+            self.initializer_range = initializer_range
             self.seed = seed
             self.type_vocab_size = type_vocab_size
             self.all_model_classes = all_model_classes
@@ -129,7 +131,8 @@ class XLNetModelTest(unittest.TestCase):
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
                 reuse_len=self.reuse_len,
-                bi_data=self.bi_data)
+                bi_data=self.bi_data,
+                initializer_range=self.initializer_range)
 
             return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels)
 
@@ -180,7 +183,7 @@ class XLNetModelTest(unittest.TestCase):
 
         def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
             inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict)
+            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
 
     def test_default(self):
         self.run_tester(XLNetModelTest.XLNetModelTester(self))

From 7ed5bf706f4d378d0e8fd297fd5d4529c64a3553 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 2 Jul 2019 16:42:22 +0200
Subject: [PATCH 037/139] add tests

---
 pytorch_pretrained_bert/tests/model_tests_commons.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index da5d0f8b8a..ee48a70d65 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -31,6 +31,14 @@ def _config_zero_init(config):
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
+def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
+    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+    for model_class in model_classes:
+        model = model_class(config=configs_no_init)
+        model.eval()
+        inputs = inputs_dict['input_ids']  # Let's keep only input_ids
+        traced_model = torch.jit.trace(model, inputs)
+
 def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
     configs_no_init = _config_zero_init(config)
     for model_class in model_classes:
@@ -39,7 +47,7 @@ def _create_and_check_initialization(tester, model_classes, config, inputs_dict)
             tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0], msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
 
 def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
-    configs_no_init = _config_zero_init(config)
+    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
     for model_class in model_classes:
         config.output_attentions = True
         config.output_hidden_states = True
@@ -155,6 +163,7 @@ def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_di
 
 def create_and_check_commons(tester, config, inputs_dict, test_pruning=True):
     _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)

From 6ce1ee04fca7bbe8719ac3fc7b0e23288fba0dfc Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 2 Jul 2019 17:22:59 -0400
Subject: [PATCH 038/139] TorchScript testing with output_attentions and
 output_hidden_state

---
 .../tests/model_tests_commons.py              | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index ee48a70d65..75c0ae19fd 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -31,13 +31,50 @@ def _config_zero_init(config):
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
+def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
+    config.output_attentions = True
+    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+
+def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
+    config.output_hidden_states = True
+    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+
 def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
     configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
     for model_class in model_classes:
         model = model_class(config=configs_no_init)
         model.eval()
         inputs = inputs_dict['input_ids']  # Let's keep only input_ids
-        traced_model = torch.jit.trace(model, inputs)
+
+        try:
+            torch.jit.trace(model, inputs)
+        except RuntimeError:
+            tester.parent.fail("Couldn't trace module.")
+
+        try:
+            traced_gpt2 = torch.jit.trace(model, inputs)
+            torch.jit.save(traced_gpt2, "traced_model.pt")
+        except RuntimeError:
+            tester.parent.fail("Couldn't save module.")
+
+        try:
+            loaded_model = torch.jit.load("traced_model.pt")
+            os.remove("traced_model.pt")
+        except ValueError:
+            tester.parent.fail("Couldn't load module.")
+
+        model.eval()
+        loaded_model.eval()
+
+        model_params = model.parameters()
+        loaded_model_params = loaded_model.parameters()
+
+        models_equal = True
+        for p1, p2 in zip(model_params, loaded_model_params):
+            if p1.data.ne(p2.data).sum() > 0:
+                models_equal = False
+
+        tester.parent.assertTrue(models_equal)
 
 def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
     configs_no_init = _config_zero_init(config)
@@ -164,6 +201,8 @@ def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_di
 def create_and_check_commons(tester, config, inputs_dict, test_pruning=True):
     _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
+    _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)

From e891bb43d5d9017ca739812b84751f0c81eefbfd Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 2 Jul 2019 17:23:18 -0400
Subject: [PATCH 039/139] BERT can be exported to TorchScript

---
 pytorch_pretrained_bert/modeling.py | 54 ++++++++++++++---------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 999ba9d79f..eb7fdf1a14 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -323,7 +323,7 @@ class BertSelfAttention(nn.Module):
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
 
-        outputs = [context_layer, attention_probs] if self.output_attentions else [context_layer]
+        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
         return outputs
 
 
@@ -367,7 +367,7 @@ class BertAttention(nn.Module):
     def forward(self, input_tensor, attention_mask, head_mask=None):
         self_outputs = self.self(input_tensor, attention_mask, head_mask)
         attention_output = self.output(self_outputs[0], input_tensor)
-        outputs = [attention_output] + self_outputs[1:]  # add attentions if we output them
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
 
 
@@ -412,7 +412,7 @@ class BertLayer(nn.Module):
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
-        outputs = [layer_output] + attention_outputs[1:]  # add attentions if we output them
+        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
         return outputs
 
 
@@ -424,27 +424,27 @@ class BertEncoder(nn.Module):
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask, head_mask=None):
-        all_hidden_states = []
-        all_attentions = []
+        all_hidden_states = ()
+        all_attentions = ()
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
-                all_hidden_states.append(hidden_states)
+                all_hidden_states += (hidden_states,)
 
             layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
-                all_attentions.append(layer_outputs[1])
+                all_attentions += (layer_outputs[1],)
 
         # Add last layer
         if self.output_hidden_states:
-            all_hidden_states.append(hidden_states)
+            all_hidden_states += (hidden_states,)
 
-        outputs = [hidden_states]
+        outputs = (hidden_states,)
         if self.output_hidden_states:
-            outputs.append(all_hidden_states)
+            outputs += (all_hidden_states,)
         if self.output_attentions:
-            outputs.append(all_attentions)
+            outputs += (all_attentions,)
         return outputs  # outputs, (hidden states), (attentions)
 
 
@@ -490,7 +490,7 @@ class BertLMPredictionHead(nn.Module):
         self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                  bert_model_embedding_weights.size(0),
                                  bias=False)
-        self.decoder.weight = bert_model_embedding_weights
+        self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
         self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
 
     def forward(self, hidden_states):
@@ -666,7 +666,7 @@ class BertModel(BertPreTrainedModel):
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
-        outputs = [sequence_output, pooled_output] + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
@@ -739,14 +739,14 @@ class BertForPreTraining(BertPreTrainedModel):
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
 
-        outputs = [prediction_scores, seq_relationship_score] + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
 
         if masked_lm_labels is not None and next_sentence_label is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
             total_loss = masked_lm_loss + next_sentence_loss
-            outputs = [total_loss] + outputs
+            outputs = (total_loss,) + outputs
 
         return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
@@ -815,11 +815,11 @@ class BertForMaskedLM(BertPreTrainedModel):
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
 
-        outputs = [prediction_scores] + outputs[2:]  # Add hidden states and attention is they are here
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention is they are here
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
-            outputs = [masked_lm_loss] + outputs
+            outputs = (masked_lm_loss,) + outputs
 
         return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
@@ -885,11 +885,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
         seq_relationship_score = self.cls(pooled_output)
 
-        outputs = [seq_relationship_score] + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
         if next_sentence_label is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
-            outputs = [next_sentence_loss] + outputs
+            outputs = (next_sentence_loss,) + outputs
 
         return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
 
@@ -960,7 +960,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
 
-        outputs = [logits] + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
 
         if labels is not None:
             if self.num_labels == 1:
@@ -970,7 +970,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
@@ -1043,12 +1043,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, num_choices)
 
-        outputs = [reshaped_logits] + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
@@ -1119,7 +1119,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        outputs = [logits] + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             # Only keep active parts of the loss
@@ -1130,7 +1130,7 @@ class BertForTokenClassification(BertPreTrainedModel):
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
@@ -1205,7 +1205,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        outputs = [start_logits, end_logits] + outputs[2:]
+        outputs = (start_logits, end_logits,) + outputs[2:]
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -1221,6 +1221,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            outputs = [total_loss] + outputs
+            outputs = (total_loss,) + outputs
 
         return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)

From 288be7b7ea47ef342cb7649402f879d8f99ceb16 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 2 Jul 2019 23:42:31 +0200
Subject: [PATCH 040/139] xlm

---
 .../convert_xlm_checkpoint_to_pytorch.py      |  73 ++++
 pytorch_pretrained_bert/modeling_xlm.py       | 353 ++++++++----------
 pytorch_pretrained_bert/tokenization_xlm.py   | 326 ++++++++++++++++
 3 files changed, 547 insertions(+), 205 deletions(-)
 create mode 100755 pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
 create mode 100644 pytorch_pretrained_bert/tokenization_xlm.py

diff --git a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..44a40174b4
--- /dev/null
+++ b/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert OpenAI GPT checkpoint."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import json
+from io import open
+
+import torch
+import numpy
+
+from pytorch_pretrained_bert.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
+from pytorch_pretrained_bert.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+
+
+def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
+    # Load checkpoint
+    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
+
+    model = chkpt['model']
+
+    config = chkpt['params']
+    config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.Tensor, numpy.ndarray)))
+
+    vocab = chkpt['dico_word2id']
+    vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in d.items())
+
+    # Save pytorch-model
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model, pytorch_weights_dump_path)
+
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(config, indent=2) + "\n")
+
+    print("Save vocab file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(vocab, indent=2) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--xlm_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path the official PyTorch dump.")
+    parser.add_argument("--pytorch_dump_folder_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 325f887923..fa196215a5 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -72,29 +72,22 @@ class XLMConfig(PretrainedConfig):
 
     def __init__(self,
                  vocab_size_or_config_json_file,
-                 causal=True,
-                 d_model=1024,
-                 n_layer=24,
-                 n_head=16,
-                 d_inner=4096,
-                 ff_activation="gelu",
-                 untie_r=True,
-                 attn_type="bi",
-
+                 n_special=0,
+                 emb_dim=2048,
+                 n_layers=12,
+                 n_heads=16,
+                 dropout=0.1,
+                 attention_dropout=0.1,
+                 gelu_activation=True,
+                 sinusoidal_embeddings=False,
+                 asm=False,
+                 id2lang={ 0: "en" },
+                 lang2id={ "en": 0 },
+                 n_langs=1,
+                 n_words=30145,
                  max_position_embeddings=512,
                  initializer_range=0.02,
-                 layer_norm_eps=1e-12,
-
-                 dropout=0.1,
-                 dropatt=0.1,
-                 init="normal",
-                 init_range=0.1,
-                 init_std=0.02,
-                 mem_len=None,
-                 reuse_len=None,
-                 bi_data=False,
-                 clamp_len=-1,
-                 same_length=False):
+                 **kwargs):
         """Constructs XLMConfig.
 
         Args:
@@ -137,6 +130,8 @@ class XLMConfig(PretrainedConfig):
                 -1 means no clamping.
             same_length: bool, whether to use the same attention length for each token.
         """
+        super(XLMConfig, self).__init__(**kwargs)
+
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
@@ -144,36 +139,41 @@ class XLMConfig(PretrainedConfig):
             for key, value in json_config.items():
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.causal = causal
-            self.d_model = d_model
-            self.n_layer = n_layer
-            self.n_head = n_head
-            assert d_model % n_head == 0
-            self.d_head = d_model // n_head
-            self.ff_activation = ff_activation
-            self.d_inner = d_inner
-            self.untie_r = untie_r
-            self.attn_type = attn_type
-
+            self.n_words = vocab_size_or_config_json_file
+            self.n_special = n_special
+            self.emb_dim = emb_dim
+            self.n_layers = n_layers
+            self.n_heads = n_heads
+            self.dropout = dropout
+            self.attention_dropout = attention_dropout
+            self.gelu_activation = gelu_activation
+            self.sinusoidal_embeddings = sinusoidal_embeddings
+            self.asm = asm
+            self.id2lang = id2lang
+            self.lang2id = lang2id
+            self.n_langs = n_langs
             self.max_position_embeddings = max_position_embeddings
             self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-
-            self.init = init
-            self.init_range = init_range
-            self.init_std = init_std
-            self.dropout = dropout
-            self.dropatt = dropatt
-            self.mem_len = mem_len
-            self.reuse_len = reuse_len
-            self.bi_data = bi_data
-            self.clamp_len = clamp_len
-            self.same_length = same_length
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def total_tokens_embeddings(self):
+        return self.n_words + self.n_special
+
+    @property
+    def hidden_size(self):
+        return self.emb_dim
+
+    @property
+    def num_attention_heads(self):
+        return self.n_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layers
+
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as XLMLayerNorm
@@ -259,9 +259,10 @@ class MultiHeadAttention(nn.Module):
 
     NEW_ID = itertools.count()
 
-    def __init__(self, n_heads, dim, dropout):
+    def __init__(self, n_heads, dim, dropout, output_attentions=False):
         super().__init__()
         self.layer_id = next(MultiHeadAttention.NEW_ID)
+        self.output_attentions = output_attentions
         self.dim = dim
         self.n_heads = n_heads
         self.dropout = dropout
@@ -325,7 +326,10 @@ class MultiHeadAttention(nn.Module):
         context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
         context = unshape(context)                                            # (bs, qlen, dim)
 
-        return self.out_lin(context)
+        outputs = (self.out_lin(context),)
+        if self.output_attentions:
+            outputs = outputs + (weights)
+        return outputs
 
 
 class TransformerFFN(nn.Module):
@@ -345,52 +349,6 @@ class TransformerFFN(nn.Module):
         return x
 
 
-class BeamHypotheses(object):
-
-    def __init__(self, n_hyp, max_len, length_penalty, early_stopping):
-        """
-        Initialize n-best list of hypotheses.
-        """
-        self.max_len = max_len - 1  # ignoring <BOS>
-        self.length_penalty = length_penalty
-        self.early_stopping = early_stopping
-        self.n_hyp = n_hyp
-        self.hyp = []
-        self.worst_score = 1e9
-
-    def __len__(self):
-        """
-        Number of hypotheses in the list.
-        """
-        return len(self.hyp)
-
-    def add(self, hyp, sum_logprobs):
-        """
-        Add a new hypothesis to the list.
-        """
-        score = sum_logprobs / len(hyp) ** self.length_penalty
-        if len(self) < self.n_hyp or score > self.worst_score:
-            self.hyp.append((score, hyp))
-            if len(self) > self.n_hyp:
-                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
-                del self.hyp[sorted_scores[0][1]]
-                self.worst_score = sorted_scores[1][0]
-            else:
-                self.worst_score = min(score, self.worst_score)
-
-    def is_done(self, best_sum_logprobs):
-        """
-        If there are enough hypotheses and that none of the hypotheses being generated
-        can become better than the worst one in the heap, then we are done with this sentence.
-        """
-        if len(self) < self.n_hyp:
-            return False
-        elif self.early_stopping:
-            return True
-        else:
-            return self.worst_score >= best_sum_logprobs / self.max_len ** self.length_penalty
-
-
 class XLMPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -410,16 +368,11 @@ class XLMPreTrainedModel(PreTrainedModel):
             # Slightly different from the TF version which uses truncated_normal for initialization
             # cf https://github.com/pytorch/pytorch/pull/5617
             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
         elif isinstance(module, XLMLayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
-        elif isinstance(module, XLMRelativeAttention):
-            for param in [module.q, module.k, module.v, module.o, module.r,
-                          module.r_r_bias, module.r_s_bias, module.r_w_bias,
-                          module.seg_embed]:
-                param.data.normal_(mean=0.0, std=self.config.initializer_range)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
 
 
 class XLMModel(XLMPreTrainedModel):
@@ -429,7 +382,7 @@ class XLMModel(XLMPreTrainedModel):
                   'hidden_dim', 'dropout', 'attention_dropout', 'asm',
                   'asm_cutoffs', 'asm_div_value']
 
-    def __init__(self, params, output_attentions=False, output_hidden_states=False):  #, dico, is_encoder, with_output):
+    def __init__(self, config):  #, dico, is_encoder, with_output):
         """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
             Paper: https://arxiv.org/abs/1901.07291
             Original code: https://github.com/facebookresearch/XLM
@@ -481,41 +434,41 @@ class XLMModel(XLMPreTrainedModel):
         all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
         ```
         """
-        super(XLMModel, self).__init__(params)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
+        super(XLMModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
 
         # encoder / decoder, output layer
         # self.is_encoder = is_encoder
         # self.is_decoder = not is_encoder
         # self.with_output = with_output
-        self.causal = params.causal
+        self.causal = config.causal
 
         # dictionary / languages
-        self.n_langs = params.n_langs
-        self.n_words = params.n_words
-        self.eos_index = params.eos_index
-        self.pad_index = params.pad_index
+        self.n_langs = config.n_langs
+        self.n_words = config.n_words
+        self.eos_index = config.eos_index
+        self.pad_index = config.pad_index
         # self.dico = dico
-        self.id2lang = params.id2lang
-        self.lang2id = params.lang2id
+        self.id2lang = config.id2lang
+        self.lang2id = config.lang2id
         # assert len(self.dico) == self.n_words
         assert len(self.id2lang) == len(self.lang2id) == self.n_langs
 
         # model parameters
-        self.dim = params.emb_dim       # 512 by default
+        self.dim = config.emb_dim       # 512 by default
         self.hidden_dim = self.dim * 4  # 2048 by default
-        self.n_heads = params.n_heads   # 8 by default
-        self.n_layers = params.n_layers
-        self.dropout = params.dropout
-        self.attention_dropout = params.attention_dropout
+        self.n_heads = config.n_heads   # 8 by default
+        self.n_layers = config.n_layers
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
         assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
 
         # embeddings
-        self.position_embeddings = Embedding(params.max_position_embeddings, self.dim)
-        if params.sinusoidal_embeddings:
-            create_sinusoidal_embeddings(params.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
-        if params.n_langs > 1:
+        self.position_embeddings = Embedding(config.max_position_embeddings, self.dim)
+        if config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
+        if config.n_langs > 1:
             self.lang_embeddings = Embedding(self.n_langs, self.dim)
         self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
         self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)
@@ -535,26 +488,26 @@ class XLMModel(XLMPreTrainedModel):
             if self.is_decoder:
                 self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
                 self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
-            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout, gelu_activation=params.gelu_activation))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout, gelu_activation=config.gelu_activation))
             self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
 
-    def forward(self, x, lengths, positions=None, langs=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
+    def forward(self, input_ids, lengths, positions=None, langs=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
         """
         Inputs:
-            `x` LongTensor(bs, slen), containing word indices
+            `input_ids` LongTensor(bs, slen), containing word indices
             `lengths` LongTensor(bs), containing the length of each sentence
             `causal` Boolean, if True, the attention is only done over previous hidden states
             `positions` LongTensor(bs, slen), containing word positions
             `langs` LongTensor(bs, slen), containing language IDs
         """
-        # lengths = (x != self.pad_index).float().sum(dim=1)
-        # mask = x != self.pad_index
+        # lengths = (input_ids != self.pad_index).float().sum(dim=1)
+        # mask = input_ids != self.pad_index
 
         # check inputs
-        bs, slen = x.size()
+        bs, slen = input_ids.size()
         assert lengths.size(0) == bs
         assert lengths.max().item() <= slen
-        # x = x.transpose(0, 1)  # batch size as dimension 0
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
         # assert (src_enc is None) == (src_len is None)
         # if src_enc is not None:
         #     assert self.is_decoder
@@ -567,7 +520,7 @@ class XLMModel(XLMPreTrainedModel):
 
         # positions
         if positions is None:
-            positions = x.new(slen).long()
+            positions = input_ids.new(slen).long()
             positions = torch.arange(slen, out=positions).unsqueeze(0)
         else:
             assert positions.size() == (bs, slen)  # (slen, bs)
@@ -581,7 +534,7 @@ class XLMModel(XLMPreTrainedModel):
         # do not recompute cached elements
         if cache is not None:
             _slen = slen - cache['slen']
-            x = x[:, -_slen:]
+            input_ids = input_ids[:, -_slen:]
             positions = positions[:, -_slen:]
             if langs is not None:
                 langs = langs[:, -_slen:]
@@ -589,7 +542,7 @@ class XLMModel(XLMPreTrainedModel):
             attn_mask = attn_mask[:, -_slen:]
 
         # embeddings
-        tensor = self.embeddings(x)
+        tensor = self.embeddings(input_ids)
         tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
         if langs is not None:
             tensor = tensor + self.lang_embeddings(langs)
@@ -648,21 +601,21 @@ class XLMPredLayer(nn.Module):
     """
     Prediction layer (cross_entropy or adaptive_softmax).
     """
-    def __init__(self, params):
+    def __init__(self, config):
         super().__init__()
-        self.asm = params.asm
-        self.n_words = params.n_words
-        self.pad_index = params.pad_index
-        dim = params.emb_dim
+        self.asm = config.asm
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+        dim = config.emb_dim
 
-        if params.asm is False:
-            self.proj = Linear(dim, params.n_words, bias=True)
+        if config.asm is False:
+            self.proj = Linear(dim, config.n_words, bias=True)
         else:
             self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                 in_features=dim,
-                n_classes=params.n_words,
-                cutoffs=params.asm_cutoffs,
-                div_value=params.asm_div_value,
+                n_classes=config.n_words,
+                cutoffs=config.asm_cutoffs,
+                div_value=config.asm_div_value,
                 head_bias=True,  # default is False
             )
 
@@ -691,66 +644,63 @@ class XLMPredLayer(nn.Module):
 
 
 class XLMWithLMHeadModel(XLMPreTrainedModel):
-        """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-            Paper: https://arxiv.org/abs/1901.07291
-            Original code: https://github.com/facebookresearch/XLM
+    """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+        Paper: https://arxiv.org/abs/1901.07291
+        Original code: https://github.com/facebookresearch/XLM
 
-        Params:
-            `config`: a XLMConfig class instance with the configuration to build a new model
-            `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-            `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-                This can be used to compute head importance metrics. Default: False
+    Params:
+        `config`: a XLMConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
 
-        Inputs:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLM paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see XLM paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
-        Outputs: Tuple of (encoded_layers, pooled_output)
-            `encoded_layers`: controled by `output_all_encoded_layers` argument:
-                - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                    of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
-                    encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-                - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
-            `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
-                classifier pretrained on top of the hidden state associated to the first character of the
-                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
 
-        Example usage:
-        ```python
-        # Already been converted into WordPiece token ids
-        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-        model = modeling.XLMModel(config=config)
-        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-        ```
-        """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    model = modeling.XLMModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
         super(XLMLMHeadModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
-
         self.attn_type = config.attn_type
         self.same_length = config.same_length
 
-        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
+        self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)
 
         self.apply(self.init_weights)
@@ -761,7 +711,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self.pred_layer.proj.weight = self.transformer.embeddings.weight
 
-    def forward(self, x, lengths, positions=None, langs=None, cache=None,
+    def forward(self, input_ids, lengths, positions=None, langs=None, cache=None,
                 labels=None, head_mask=None):
         """
         Args:
@@ -789,7 +739,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        transformer_outputs = self.transformer(x, lengths, positions=positions, langs=langs, cache=cache, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths, positions=positions, langs=langs, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         logits = self.pred_layer(output, labels)
@@ -905,18 +855,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 output_attentions=False, output_hidden_states=False):
+    def __init__(self, config):
         super(XLMForSequenceClassification, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.summary_type = summary_type
-        self.num_labels = num_labels
+        self.transformer = XLMModel(config)
 
-        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
-
-        self.sequence_summary = XLMSequenceSummary(config, summary_type=summary_type, use_proj=use_proj)
+        self.sequence_summary = XLMSequenceSummary(config)
         self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
@@ -1030,13 +974,12 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, output_attentions=False, output_hidden_states=False):
+    def __init__(self, CONFIG_NAME):
         super(XLMForQuestionAnswering, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.output_hidden_states = output_hidden_states
 
-        self.transformer = XLMModel(config, output_attentions=output_attentions, output_hidden_states=output_hidden_states)
-        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.transformer = XLMModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_pretrained_bert/tokenization_xlm.py
new file mode 100644
index 0000000000..a4c1a61545
--- /dev/null
+++ b/pytorch_pretrained_bert/tokenization_xlm.py
@@ -0,0 +1,326 @@
+# coding=utf-8
+# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import json
+import logging
+import os
+import re
+import sys
+from io import open
+
+from tqdm import tqdm
+
+from .file_utils import cached_path
+from .tokenization import BasicTokenizer
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+}
+PRETRAINED_MERGES_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+}
+PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+    'xlm-mlm-en-2048': 512,
+}
+VOCAB_NAME = 'vocab.json'
+MERGES_NAME = 'merges.txt'
+SPECIAL_TOKENS_NAME = 'special_tokens.txt'
+
+INDEX= {
+  "bos_index": 0,
+  "eos_index": 1,
+  "pad_index": 2,
+  "unk_index": 3,
+  "mask_index": 5
+}
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+    word is represented as tuple of symbols (symbols being variable-length strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+def text_standardize(text):
+    """
+    fixes some issues the spacy tokenizer had on books corpus
+    also does some whitespace standardization
+    """
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    return text.strip()
+
+class XLMTokenizer(object):
+    """
+    BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
+        - lower case all inputs
+        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
+        - argument special_tokens and function set_special_tokens:
+            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
+    """
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedBertModel from a pre-trained model file.
+        Download and cache the pre-trained model file if needed.
+        """
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
+            special_tokens_file = None
+        else:
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
+            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
+            if not os.path.exists(special_tokens_file):
+                special_tokens_file = None
+            else:
+                logger.info("loading special tokens file {}".format(special_tokens_file))
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+                logger.error(
+                    "Couldn't reach server at '{}' to download vocabulary.".format(
+                        vocab_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                    "at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                        pretrained_model_name_or_path,
+                        vocab_file, merges_file))
+            return None
+        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
+            logger.info("loading vocabulary file {}".format(vocab_file))
+            logger.info("loading merges file {}".format(merges_file))
+        else:
+            logger.info("loading vocabulary file {} from cache at {}".format(
+                vocab_file, resolved_vocab_file))
+            logger.info("loading merges file {} from cache at {}".format(
+                merges_file, resolved_merges_file))
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
+            # than the number of positional embeddings
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
+            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+        # Instantiate tokenizer.
+        if special_tokens_file and 'special_tokens' not in kwargs:
+            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+        else:
+            special_tokens = kwargs.pop('special_tokens', [])
+        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
+        return tokenizer
+
+    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+        try:
+            import ftfy
+            import spacy
+            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            self.fix_text = ftfy.fix_text
+        except ImportError:
+            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
+            self.nlp = BasicTokenizer(do_lower_case=True,
+                                      never_split=special_tokens if special_tokens is not None else [])
+            self.fix_text = None
+
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v:k for k,v in self.encoder.items()}
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+        self.special_tokens = {}
+        self.special_tokens_decoder = {}
+        self.set_special_tokens(special_tokens)
+
+    def __len__(self):
+        return len(self.encoder) + len(self.special_tokens)
+
+    def set_special_tokens(self, special_tokens):
+        """ Add a list of additional tokens to the encoder.
+            The additional tokens are indexed starting from the last index of the
+            current vocabulary in the order of the `special_tokens` list.
+        """
+        if not special_tokens:
+            self.special_tokens = {}
+            self.special_tokens_decoder = {}
+            return
+        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
+        if self.fix_text is None:
+            # Using BERT's BasicTokenizer: we can update the tokenizer
+            self.nlp.never_split = special_tokens
+        logger.info("Special tokens {}".format(self.special_tokens))
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token+'</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def tokenize(self, text):
+        """ Tokenize a string. """
+        split_tokens = []
+        if self.fix_text is None:
+            # Using BERT's BasicTokenizer
+            text = self.nlp.tokenize(text)
+            for token in text:
+                split_tokens.extend([t for t in self.bpe(token).split(' ')])
+        else:
+            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
+            text = self.nlp(text_standardize(self.fix_text(text)))
+            for token in text:
+                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """ Converts a sequence of tokens into ids using the vocab. """
+        ids = []
+        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.encoder.get(tokens, 0)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.encoder.get(token, 0))
+        if len(ids) > self.max_len:
+            logger.warning(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this OpenAI GPT model ({} > {}). Running this"
+                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """Converts a sequence of ids in BPE tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            if i in self.special_tokens_decoder:
+                if not skip_special_tokens:
+                    tokens.append(self.special_tokens_decoder[i])
+            else:
+                tokens.append(self.decoder[i])
+        return tokens
+
+    def encode(self, text):
+        return self.convert_tokens_to_ids(self.tokenize(text))
+
+    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        if clean_up_tokenization_spaces:
+            out_string = out_string.replace('<unk>', '')
+            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+        return out_string
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(vocab_path):
+            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+            return
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        merge_file = os.path.join(vocab_path, MERGES_NAME)
+        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write(u'#version: 0.2\n')
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(' '.join(bpe_tokens) + u'\n')
+                index += 1
+
+        index = len(self.encoder)
+        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
+            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+
+        return vocab_file, merge_file, special_tokens_file

From d8e83de792d937141184e3e7fe83c8beb63ed4b5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 2 Jul 2019 18:01:09 -0400
Subject: [PATCH 041/139] GPT2 can be exported to TorchScript

---
 pytorch_pretrained_bert/modeling_gpt2.py | 31 ++++++++++++------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index fef4937400..d878cf5234 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -328,7 +328,8 @@ class GPT2LMHead(nn.Module):
 
     def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
         self.predict_special_tokens = predict_special_tokens
-        self.decoder.weight = model_embeddings_weights  # Tied weights
+        # Export to TorchScript can't handle parameter sharing so we are cloning them.
+        self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())  # Tied weights
 
     def forward(self, hidden_state):
         lm_logits = self.decoder(hidden_state)
@@ -557,16 +558,16 @@ class GPT2Model(GPT2PreTrainedModel):
 
         output_shape = input_shape + (hidden_states.size(-1),)
 
-        presents = []
+        presents = ()
         all_attentions = []
-        all_hidden_states = []
+        all_hidden_states = ()
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
             if self.output_hidden_states:
-                all_hidden_states.append(hidden_states.view(*output_shape))
+                all_hidden_states += (hidden_states.view(*output_shape),)
 
             outputs = block(hidden_states, layer_past, head_mask[i])
             hidden_states, present = outputs[:2]
-            presents.append(present)
+            presents += (present,)
 
             if self.output_attentions:
                 all_attentions.append(outputs[2])
@@ -576,16 +577,16 @@ class GPT2Model(GPT2PreTrainedModel):
         hidden_states = hidden_states.view(*output_shape)
         # Add last hidden state
         if self.output_hidden_states:
-            all_hidden_states.append(hidden_states)
+            all_hidden_states += (hidden_states,)
 
-        outputs = [hidden_states, presents]
+        outputs = (hidden_states, presents)
         if self.output_hidden_states:
-            outputs.append(all_hidden_states)
+            outputs += (all_hidden_states,)
         if self.output_attentions:
             # let the number of heads free (-1) so we can extract attention even after head pruning
             attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
-            all_attentions = list(t.view(*attention_output_shape) for t in all_attentions)
-            outputs.append(all_attentions)
+            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            outputs += (all_attentions,)
         return outputs  # last hidden state, presents, (all hidden_states), (attentions)
 
 
@@ -658,7 +659,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 
         lm_logits = self.lm_head(hidden_states)
 
-        outputs = [lm_logits] + transformer_outputs[1:]
+        outputs = (lm_logits,) + transformer_outputs[1:]
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
@@ -667,7 +668,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
 
@@ -750,18 +751,18 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
 
-        outputs = [lm_logits, mc_logits] + transformer_outputs[1:]
+        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
                             mc_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)

From be54b16960947b6fc8f0656c64511170edf30631 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 2 Jul 2019 18:09:45 -0400
Subject: [PATCH 042/139] GPT can be exported to TorchScript

---
 pytorch_pretrained_bert/modeling_openai.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f4fe09110a..0db4b28caf 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -355,7 +355,7 @@ class OpenAIGPTLMHead(nn.Module):
     def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
         self.predict_special_tokens = predict_special_tokens
         embed_shape = model_embeddings_weights.shape
-        self.decoder.weight = model_embeddings_weights  # Tied weights
+        self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())  # Tied weights
 
     def forward(self, hidden_state):
         lm_logits = self.decoder(hidden_state)
@@ -579,26 +579,26 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
         output_shape = input_shape + (hidden_states.size(-1),)
 
-        all_attentions = []
-        all_hidden_states = []
+        all_attentions = ()
+        all_hidden_states = ()
         for i, block in enumerate(self.h):
             if self.output_hidden_states:
-                all_hidden_states.append(hidden_states.view(*output_shape))
+                all_hidden_states += (hidden_states.view(*output_shape),)
 
             outputs = block(hidden_states, head_mask[i])
             hidden_states = outputs[0]
             if self.output_attentions:
-                all_attentions.append(outputs[1])
+                all_attentions += (outputs[1],)
 
         # Add last layer
         if self.output_hidden_states:
-            all_hidden_states.append(hidden_states.view(*output_shape))
+            all_hidden_states += (hidden_states.view(*output_shape),)
 
-        outputs = [hidden_states.view(*output_shape)]
+        outputs = (hidden_states.view(*output_shape),)
         if self.output_hidden_states:
-            outputs.append(all_hidden_states)
+            outputs += (all_hidden_states,)
         if self.output_attentions:
-            outputs.append(all_attentions)
+            outputs += (all_attentions,)
         return outputs  # last hidden state, (all hidden states), (all attentions)
 
 
@@ -682,7 +682,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
-        outputs = [lm_logits] + transformer_outputs[1:]
+        outputs = (lm_logits,) + transformer_outputs[1:]
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
@@ -691,7 +691,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
 
@@ -785,18 +785,18 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
 
-        outputs = [lm_logits, mc_logits] + transformer_outputs[1:]
+        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
                             mc_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)

From 971c24687fe722a38b68a45bcbe34f103d896c6d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 3 Jul 2019 11:03:09 -0400
Subject: [PATCH 043/139] XLNET can be exported to TorchScript

---
 pytorch_pretrained_bert/modeling_xlnet.py | 43 ++++++++++++-----------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index da5ccdb8f3..c4c3354070 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -384,7 +384,8 @@ class XLNetRelativeAttention(nn.Module):
         x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3])
         x = x[1:, ...]
         x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
-        x = x[:, 0:klen, :, :]
+        # x = x[:, 0:klen, :, :]
+        x = torch.index_select(x, 1, torch.arange(klen))
 
         return x
 
@@ -527,9 +528,9 @@ class XLNetRelativeAttention(nn.Module):
             output_h = self.post_attention(h, attn_vec)
             output_g = None
 
-        outputs = [output_h, output_g]
+        outputs = (output_h, output_g)
         if self.output_attentions:
-            outputs = outputs + [attn_prob]
+            outputs += (attn_prob,)
         return outputs
 
 class XLNetFeedForward(nn.Module):
@@ -574,7 +575,7 @@ class XLNetLayer(nn.Module):
             output_g = self.ff(output_g)
         output_h = self.ff(output_h)
 
-        outputs = [output_h, output_g] + outputs[2:]  # Add again attentions if there are there
+        outputs = (output_h, output_g) + outputs[2:]  # Add again attentions if there are there
         return outputs
 
 
@@ -688,7 +689,7 @@ class XLNetModel(XLNetPreTrainedModel):
     def relative_positional_encoding(self, qlen, klen, bsz=None):
         """create relative positional encoding."""
         freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
-        inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
+        inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
 
         if self.attn_type == 'bi':
             # beg, end = klen - 1, -qlen
@@ -869,7 +870,7 @@ class XLNetModel(XLNetPreTrainedModel):
         else:
             head_mask = [None] * self.n_layer
 
-        new_mems = []
+        new_mems = ()
         if mems is None:
             mems = [None] * len(self.layer)
 
@@ -877,7 +878,7 @@ class XLNetModel(XLNetPreTrainedModel):
         hidden_states = []
         for i, layer_module in enumerate(self.layer):
             # cache new mems
-            new_mems.append(self.cache_mem(output_h, mems[i]))
+            new_mems += (self.cache_mem(output_h, mems[i]),)
             if self.output_hidden_states:
                 hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
@@ -895,16 +896,16 @@ class XLNetModel(XLNetPreTrainedModel):
         output = self.dropout(output_g if output_g is not None else output_h)
 
         # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
-        outputs = [output.permute(1, 0, 2).contiguous(), new_mems]
+        outputs = (output.permute(1, 0, 2).contiguous(), new_mems)
         if self.output_hidden_states:
             if output_g is not None:
-                hidden_states = [h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs]
+                hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
             else:
-                hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
-            outputs.append(hidden_states)
+                hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
+            outputs += (hidden_states,)
         if self.output_attentions:
-            attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
-            outputs.append(attentions)
+            attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+            outputs += (attentions,)
 
         return outputs  # outputs, new_mems, (hidden_states), (attentions)
 
@@ -986,7 +987,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        self.lm_loss.weight = self.transformer.word_embedding.weight
+        self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
@@ -1026,14 +1027,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         logits = self.lm_loss(transformer_outputs[0])
 
-        outputs = [logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 
         if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(logits.view(-1, logits.size(-1)),
                             labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
@@ -1061,7 +1062,7 @@ class XLNetSequenceSummary(nn.Module):
             output = hidden_states[:, 0]
         elif self.summary_type == 'mean':
             output = hidden_states.mean(dim=1)
-        elif summary_type == 'attn':
+        elif self.summary_type == 'attn':
             raise NotImplementedError
 
         output = self.summary(output)
@@ -1180,7 +1181,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
 
-        outputs = [logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 
         if labels is not None:
             if self.num_labels == 1:
@@ -1190,7 +1191,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = [loss] + outputs
+            outputs = (loss,) + outputs
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
@@ -1271,7 +1272,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        outputs = [start_logits, end_logits] + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
@@ -1288,6 +1289,6 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            outputs = [total_loss] + outputs
+            outputs = (total_loss,) + outputs
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)

From 4703148f0cdf5308d707e95ce285d01bf4e8ccfd Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 3 Jul 2019 14:50:23 -0400
Subject: [PATCH 044/139] TransformerXL can't be exported to TorchScript
 because of control-flow. Exception added to tests.

---
 pytorch_pretrained_bert/tests/model_tests_commons.py  | 11 +++++++----
 .../tests/modeling_transfo_xl_test.py                 |  2 +-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index 75c0ae19fd..0afda5f2ce 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -198,14 +198,17 @@ def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_di
             [tester.seq_length, tester.hidden_size])
 
 
-def create_and_check_commons(tester, config, inputs_dict, test_pruning=True):
+def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
     _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
     _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
+
+    if test_torchscript:
+        _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
+        _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
+        _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
+
     if test_pruning:
         _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
 
diff --git a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
index 8b46b6d755..caeb25b412 100644
--- a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
@@ -173,7 +173,7 @@ class TransfoXLModelTest(unittest.TestCase):
 
         def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
             inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
+            create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False)
 
     def test_default(self):
         self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))

From b43b130f35d1c6e3e925762c1c06e3e53ebdea37 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 3 Jul 2019 16:21:17 -0400
Subject: [PATCH 045/139] TorchScript flag in config; Tied weights when not
 running TorchScript; tuple concatenation clean-up.

---
 pytorch_pretrained_bert/model_utils.py         |  1 +
 pytorch_pretrained_bert/modeling.py            | 18 ++++++++++++------
 pytorch_pretrained_bert/modeling_gpt2.py       | 16 ++++++++++------
 pytorch_pretrained_bert/modeling_openai.py     | 18 +++++++++++-------
 pytorch_pretrained_bert/modeling_xlnet.py      | 14 +++++++++-----
 .../tests/model_tests_commons.py               |  1 +
 6 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index 8c116df54a..ec735c3e0a 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -46,6 +46,7 @@ class PretrainedConfig(object):
         self.num_labels = kwargs.pop('num_labels', 2)
         self.output_attentions = kwargs.pop('output_attentions', False)
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+        self.torchscript = kwargs.pop('torchscript', False)
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index eb7fdf1a14..7b18cb8452 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -428,23 +428,23 @@ class BertEncoder(nn.Module):
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
-                all_hidden_states += (hidden_states,)
+                all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
-                all_attentions += (layer_outputs[1],)
+                all_attentions = all_attentions + (layer_outputs[1],)
 
         # Add last layer
         if self.output_hidden_states:
-            all_hidden_states += (hidden_states,)
+            all_hidden_states = all_hidden_states + (hidden_states,)
 
         outputs = (hidden_states,)
         if self.output_hidden_states:
-            outputs += (all_hidden_states,)
+            outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
-            outputs += (all_attentions,)
+            outputs = outputs + (all_attentions,)
         return outputs  # outputs, (hidden states), (attentions)
 
 
@@ -484,13 +484,19 @@ class BertLMPredictionHead(nn.Module):
     def __init__(self, config, bert_model_embedding_weights):
         super(BertLMPredictionHead, self).__init__()
         self.transform = BertPredictionHeadTransform(config)
+        self.torchscript = config.torchscript
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
         self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                  bert_model_embedding_weights.size(0),
                                  bias=False)
-        self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
+
+        if self.torchscript:
+            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
+        else:
+            self.decoder.weight = bert_model_embedding_weights
+
         self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
 
     def forward(self, hidden_states):
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index d878cf5234..ba4fd3e2aa 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -322,6 +322,7 @@ class GPT2LMHead(nn.Module):
         self.n_embd = config.n_embd
         self.vocab_size = config.vocab_size
         self.predict_special_tokens = config.predict_special_tokens
+        self.torchscript = config.torchscript
         embed_shape = model_embeddings_weights.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.set_embeddings_weights(model_embeddings_weights)
@@ -329,7 +330,10 @@ class GPT2LMHead(nn.Module):
     def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
         self.predict_special_tokens = predict_special_tokens
         # Export to TorchScript can't handle parameter sharing so we are cloning them.
-        self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())  # Tied weights
+        if self.torchscript:
+            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
+        else:
+            self.decoder.weight = model_embeddings_weights  # Tied weights
 
     def forward(self, hidden_state):
         lm_logits = self.decoder(hidden_state)
@@ -563,11 +567,11 @@ class GPT2Model(GPT2PreTrainedModel):
         all_hidden_states = ()
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
             if self.output_hidden_states:
-                all_hidden_states += (hidden_states.view(*output_shape),)
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
             outputs = block(hidden_states, layer_past, head_mask[i])
             hidden_states, present = outputs[:2]
-            presents += (present,)
+            presents = presents + (present,)
 
             if self.output_attentions:
                 all_attentions.append(outputs[2])
@@ -577,16 +581,16 @@ class GPT2Model(GPT2PreTrainedModel):
         hidden_states = hidden_states.view(*output_shape)
         # Add last hidden state
         if self.output_hidden_states:
-            all_hidden_states += (hidden_states,)
+            all_hidden_states = all_hidden_states + (hidden_states,)
 
         outputs = (hidden_states, presents)
         if self.output_hidden_states:
-            outputs += (all_hidden_states,)
+            outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             # let the number of heads free (-1) so we can extract attention even after head pruning
             attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
             all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
-            outputs += (all_attentions,)
+            outputs = outputs + (all_attentions,)
         return outputs  # last hidden state, presents, (all hidden_states), (attentions)
 
 
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 0db4b28caf..ed3c0c13ee 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -348,14 +348,18 @@ class OpenAIGPTLMHead(nn.Module):
         self.n_embd = config.n_embd
         self.vocab_size = config.vocab_size
         self.predict_special_tokens = config.predict_special_tokens
+        self.torchscript = config.torchscript
         embed_shape = model_embeddings_weights.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.set_embeddings_weights(model_embeddings_weights)
 
     def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
         self.predict_special_tokens = predict_special_tokens
-        embed_shape = model_embeddings_weights.shape
-        self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())  # Tied weights
+
+        if self.torchscript:
+            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
+        else:
+            self.decoder.weight = model_embeddings_weights  # Tied weights
 
     def forward(self, hidden_state):
         lm_logits = self.decoder(hidden_state)
@@ -583,22 +587,22 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         all_hidden_states = ()
         for i, block in enumerate(self.h):
             if self.output_hidden_states:
-                all_hidden_states += (hidden_states.view(*output_shape),)
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
             outputs = block(hidden_states, head_mask[i])
             hidden_states = outputs[0]
             if self.output_attentions:
-                all_attentions += (outputs[1],)
+                all_attentions = all_attentions + (outputs[1],)
 
         # Add last layer
         if self.output_hidden_states:
-            all_hidden_states += (hidden_states.view(*output_shape),)
+            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
         outputs = (hidden_states.view(*output_shape),)
         if self.output_hidden_states:
-            outputs += (all_hidden_states,)
+            outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
-            outputs += (all_attentions,)
+            outputs = outputs + (all_attentions,)
         return outputs  # last hidden state, (all hidden states), (all attentions)
 
 
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index c4c3354070..2771ba7ca5 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -530,7 +530,7 @@ class XLNetRelativeAttention(nn.Module):
 
         outputs = (output_h, output_g)
         if self.output_attentions:
-            outputs += (attn_prob,)
+            outputs = outputs + (attn_prob,)
         return outputs
 
 class XLNetFeedForward(nn.Module):
@@ -878,7 +878,7 @@ class XLNetModel(XLNetPreTrainedModel):
         hidden_states = []
         for i, layer_module in enumerate(self.layer):
             # cache new mems
-            new_mems += (self.cache_mem(output_h, mems[i]),)
+            new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
             if self.output_hidden_states:
                 hidden_states.append((output_h, output_g) if output_g is not None else output_h)
 
@@ -902,10 +902,10 @@ class XLNetModel(XLNetPreTrainedModel):
                 hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
             else:
                 hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
-            outputs += (hidden_states,)
+            outputs = outputs + (hidden_states,)
         if self.output_attentions:
             attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
-            outputs += (attentions,)
+            outputs = outputs + (attentions,)
 
         return outputs  # outputs, new_mems, (hidden_states), (attentions)
 
@@ -975,6 +975,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
         self.same_length = config.same_length
+        self.torchscript = config.torchscript
 
         self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
@@ -987,7 +988,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
+        if self.torchscript:
+            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
+        else:
+            self.lm_loss.weight = self.transformer.word_embedding.weight
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index 0afda5f2ce..e93cc98ffe 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -41,6 +41,7 @@ def _create_and_check_torchscript_output_hidden_state(tester, model_classes, con
 
 def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
     configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+    configs_no_init.torchscript = True
     for model_class in model_classes:
         model = model_class(config=configs_no_init)
         model.eval()

From c41f2bad69923da6f23d76e47639ad350206d757 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Jul 2019 22:54:39 +0200
Subject: [PATCH 046/139] WIP XLM + refactoring

---
 .../lm_finetuning/finetune_on_pregenerated.py |   4 +-
 .../pregenerate_training_data.py              |   2 +-
 .../lm_finetuning/simple_lm_finetuning.py     |   4 +-
 examples/run_bert_classifier.py               |   4 +-
 examples/run_bert_extract_features.py         |   4 +-
 examples/run_bert_squad.py                    |   4 +-
 examples/run_bert_swag.py                     |   4 +-
 examples/utils_squad.py                       |   2 +-
 hubconfs/bert_hubconf.py                      |   4 +-
 .../Comparing-TF-and-PT-models-MLM-NSP.ipynb  |   6 +-
 notebooks/Comparing-TF-and-PT-models.ipynb    |   4 +-
 pytorch_pretrained_bert/__init__.py           |   8 +-
 .../convert_tf_checkpoint_to_pytorch.py       |   2 +-
 .../{modeling.py => modeling_bert.py}         |   0
 pytorch_pretrained_bert/modeling_gpt2.py      |   2 +-
 pytorch_pretrained_bert/modeling_openai.py    |   2 +-
 .../modeling_transfo_xl.py                    |   2 +-
 pytorch_pretrained_bert/modeling_xlm.py       | 219 +++++++-------
 .../tests/model_tests_commons.py              |   4 +-
 .../tests/model_utils_test.py                 |   2 +-
 ...modeling_test.py => modeling_bert_test.py} |   7 +-
 .../tests/modeling_xlm_test.py                | 276 ++++++++++++++++++
 ...tion_test.py => tokenization_bert_test.py} |   2 +-
 .../tests/tokenization_xlm_test.py            |  79 +++++
 .../{tokenization.py => tokenization_bert.py} |   0
 .../tokenization_openai.py                    |   2 +-
 pytorch_pretrained_bert/tokenization_xlm.py   |   2 +-
 27 files changed, 515 insertions(+), 136 deletions(-)
 rename pytorch_pretrained_bert/{modeling.py => modeling_bert.py} (100%)
 rename pytorch_pretrained_bert/tests/{modeling_test.py => modeling_bert_test.py} (99%)
 create mode 100644 pytorch_pretrained_bert/tests/modeling_xlm_test.py
 rename pytorch_pretrained_bert/tests/{tokenization_test.py => tokenization_bert_test.py} (98%)
 create mode 100644 pytorch_pretrained_bert/tests/tokenization_xlm_test.py
 rename pytorch_pretrained_bert/{tokenization.py => tokenization_bert.py} (100%)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 2a5783c261..8eda2aa5c5 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -14,8 +14,8 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
 
 from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 
 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 8bed1e54d4..c2211c88e6 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -5,7 +5,7 @@ from tempfile import TemporaryDirectory
 import shelve
 
 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 368d6825c7..bcfd138442 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -30,8 +30,8 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index d987b35321..233a7ee5d1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -35,8 +35,8 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from tensorboardX import SummaryWriter
 
 from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.modeling_bert import BertForSequenceClassification
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
diff --git a/examples/run_bert_extract_features.py b/examples/run_bert_extract_features.py
index 13384a9d69..2a550c431a 100644
--- a/examples/run_bert_extract_features.py
+++ b/examples/run_bert_extract_features.py
@@ -28,8 +28,8 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import BertModel
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_pretrained_bert.modeling_bert import BertModel
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index 54eceb36f7..f8eee9c8eb 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -34,9 +34,9 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter
 
 from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
+from pytorch_pretrained_bert.modeling_bert import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/run_bert_swag.py b/examples/run_bert_swag.py
index 28fd323c73..3e45225891 100644
--- a/examples/run_bert_swag.py
+++ b/examples/run_bert_swag.py
@@ -33,9 +33,9 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
+from pytorch_pretrained_bert.modeling_bert import BertForMultipleChoice, BertConfig
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index e4e43eff9d..0dfecd202c 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize
+from pytorch_pretrained_bert.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 logger = logging.getLogger(__name__)
 
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 3769c2567f..94c7a18a30 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import (
+from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_pretrained_bert.modeling_bert import (
         BertModel,
         BertForNextSentencePrediction,
         BertForMaskedLM,
diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
index 67c56ead38..ea7271df96 100644
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -3997,9 +3997,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/notebooks/Comparing-TF-and-PT-models.ipynb b/notebooks/Comparing-TF-and-PT-models.ipynb
index 5e724a710a..3e438e2f55 100644
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models.ipynb
@@ -375,8 +375,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   Model config {\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 7d823a045d..e14b8b27a9 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,11 +1,12 @@
 __version__ = "0.6.2"
-from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
+from .tokenization_xlm import XLMTokenizer
 
-from .modeling import (BertConfig, BertModel, BertForPreTraining,
+from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering,
@@ -22,6 +23,9 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet)
+from .modeling_xlm import (XLMConfig, XLMModel,
+                           XLMWithLMHeadModel, XLMForSequenceClassification,
+                           XLMForQuestionAnswering)
 
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index 13d96384fd..42f7380969 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_pretrained_bert.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling.py
rename to pytorch_pretrained_bert/modeling_bert.py
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index fef4937400..774ba68509 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -32,7 +32,7 @@ from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
 from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
-from .modeling import BertLayerNorm as LayerNorm
+from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index f4fe09110a..7948a070bf 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -32,7 +32,7 @@ from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
 from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
-from .modeling import BertLayerNorm as LayerNorm
+from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 871f699b1a..9a882bce96 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .modeling import BertLayerNorm as LayerNorm
+from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
 from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index fa196215a5..66a0b0b1ed 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -71,7 +71,7 @@ class XLMConfig(PretrainedConfig):
     pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file,
+                 vocab_size_or_config_json_file=30145,
                  n_special=0,
                  emb_dim=2048,
                  n_layers=12,
@@ -80,13 +80,20 @@ class XLMConfig(PretrainedConfig):
                  attention_dropout=0.1,
                  gelu_activation=True,
                  sinusoidal_embeddings=False,
+                 causal=False,
                  asm=False,
-                 id2lang={ 0: "en" },
-                 lang2id={ "en": 0 },
                  n_langs=1,
-                 n_words=30145,
                  max_position_embeddings=512,
-                 initializer_range=0.02,
+                 embed_init_std=2048 ** -0.5,
+                 init_std=0.02,
+                 summary_type="last",
+                 use_proj=True,
+                 bos_index=0,
+                 eos_index=1,
+                 pad_index=2,
+                 unk_index=3,
+                 mask_index=5,
+                 is_encoder=True,
                  **kwargs):
         """Constructs XLMConfig.
 
@@ -148,12 +155,20 @@ class XLMConfig(PretrainedConfig):
             self.attention_dropout = attention_dropout
             self.gelu_activation = gelu_activation
             self.sinusoidal_embeddings = sinusoidal_embeddings
+            self.causal = causal
             self.asm = asm
-            self.id2lang = id2lang
-            self.lang2id = lang2id
             self.n_langs = n_langs
+            self.summary_type = summary_type
+            self.use_proj = use_proj
+            self.bos_index = bos_index
+            self.eos_index = eos_index
+            self.pad_index = pad_index
+            self.unk_index = unk_index
+            self.mask_index = mask_index
+            self.is_encoder = is_encoder
             self.max_position_embeddings = max_position_embeddings
-            self.initializer_range = initializer_range
+            self.embed_init_std = embed_init_std
+            self.init_std = init_std
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -175,37 +190,21 @@ class XLMConfig(PretrainedConfig):
         return self.n_layers
 
 
-try:
-    from apex.normalization.fused_layer_norm import FusedLayerNorm as XLMLayerNorm
-except ImportError:
-    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
-    class XLMLayerNorm(nn.Module):
-        def __init__(self, d_model, eps=1e-12):
-            """Construct a layernorm module in the TF style (epsilon inside the square root).
-            """
-            super(XLMLayerNorm, self).__init__()
-            self.weight = nn.Parameter(torch.ones(d_model))
-            self.bias = nn.Parameter(torch.zeros(d_model))
-            self.variance_epsilon = eps
-
-        def forward(self, x):
-            u = x.mean(-1, keepdim=True)
-            s = (x - u).pow(2).mean(-1, keepdim=True)
-            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-            return self.weight * x + self.bias
-
-
-def Embedding(num_embeddings, embedding_dim, padding_idx=None):
+def Embedding(num_embeddings, embedding_dim, padding_idx=None, config=None):
     m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
-    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
+    if config is not None and config.embed_init_std is not None:
+        nn.init.normal_(m.weight, mean=0, std=config.embed_init_std)
     if padding_idx is not None:
         nn.init.constant_(m.weight[padding_idx], 0)
     return m
 
 
-def Linear(in_features, out_features, bias=True):
+def Linear(in_features, out_features, bias=True, config=None):
     m = nn.Linear(in_features, out_features, bias)
-    # nn.init.normal_(m.weight, mean=0, std=1)
+    if config is not None and config.init_std is not None:
+        nn.init.normal_(m.weight, mean=0, std=config.init_std)
+        if bias:
+            nn.init.constant_(m.bias, 0.)
     # nn.init.xavier_uniform_(m.weight)
     # nn.init.constant_(m.bias, 0.)
     return m
@@ -233,14 +232,17 @@ def gelu(x):
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
-def get_masks(slen, lengths, causal):
+def get_masks(slen, lengths, causal, padding_mask=None):
     """
     Generate hidden states mask, and optionally an attention mask.
     """
-    assert lengths.max().item() <= slen
     bs = lengths.size(0)
-    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
-    mask = alen < lengths[:, None]
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        assert lengths.max().item() <= slen
+        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
+        mask = alen < lengths[:, None]
 
     # attention mask is the same as mask, or triangular inferior attention (causal)
     if causal:
@@ -259,21 +261,21 @@ class MultiHeadAttention(nn.Module):
 
     NEW_ID = itertools.count()
 
-    def __init__(self, n_heads, dim, dropout, output_attentions=False):
+    def __init__(self, n_heads, dim, config):
         super().__init__()
         self.layer_id = next(MultiHeadAttention.NEW_ID)
-        self.output_attentions = output_attentions
+        self.output_attentions = config.output_attentions
         self.dim = dim
         self.n_heads = n_heads
-        self.dropout = dropout
+        self.dropout = config.attention_dropout
         assert self.dim % self.n_heads == 0
 
-        self.q_lin = Linear(dim, dim)
-        self.k_lin = Linear(dim, dim)
-        self.v_lin = Linear(dim, dim)
-        self.out_lin = Linear(dim, dim)
+        self.q_lin = Linear(dim, dim, config=config)
+        self.k_lin = Linear(dim, dim, config=config)
+        self.v_lin = Linear(dim, dim, config=config)
+        self.out_lin = Linear(dim, dim, config=config)
 
-    def forward(self, input, mask, kv=None, cache=None):
+    def forward(self, input, mask, kv=None, cache=None, head_mask=None):
         """
         Self-attention (if kv is None) or attention over source sentence (provided by kv).
         """
@@ -323,6 +325,11 @@ class MultiHeadAttention(nn.Module):
 
         weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
         context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
         context = unshape(context)                                            # (bs, qlen, dim)
 
@@ -334,12 +341,12 @@ class MultiHeadAttention(nn.Module):
 
 class TransformerFFN(nn.Module):
 
-    def __init__(self, in_dim, dim_hidden, out_dim, dropout, gelu_activation):
+    def __init__(self, in_dim, dim_hidden, out_dim, config):
         super().__init__()
-        self.dropout = dropout
-        self.lin1 = Linear(in_dim, dim_hidden)
-        self.lin2 = Linear(dim_hidden, out_dim)
-        self.act = gelu if gelu_activation else F.relu
+        self.dropout = config.dropout
+        self.lin1 = Linear(in_dim, dim_hidden, config=config)
+        self.lin2 = Linear(dim_hidden, out_dim, config=config)
+        self.act = gelu if config.gelu_activation else F.relu
 
     def forward(self, input):
         x = self.lin1(input)
@@ -365,12 +372,9 @@ class XLMPreTrainedModel(PreTrainedModel):
         """ Initialize the weights.
         """
         if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, XLMLayerNorm):
+            # Weights are initialized in module instantiation (see above)
+            pass
+        if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
@@ -439,8 +443,10 @@ class XLMModel(XLMPreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
 
         # encoder / decoder, output layer
-        # self.is_encoder = is_encoder
-        # self.is_decoder = not is_encoder
+        self.is_encoder = config.is_encoder
+        self.is_decoder = not config.is_encoder
+        if self.is_decoder:
+            raise NotImplementedError("Currently XLM can only be used as an encoder")
         # self.with_output = with_output
         self.causal = config.causal
 
@@ -450,10 +456,10 @@ class XLMModel(XLMPreTrainedModel):
         self.eos_index = config.eos_index
         self.pad_index = config.pad_index
         # self.dico = dico
-        self.id2lang = config.id2lang
-        self.lang2id = config.lang2id
+        # self.id2lang = config.id2lang
+        # self.lang2id = config.lang2id
         # assert len(self.dico) == self.n_words
-        assert len(self.id2lang) == len(self.lang2id) == self.n_langs
+        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
 
         # model parameters
         self.dim = config.emb_dim       # 512 by default
@@ -465,12 +471,12 @@ class XLMModel(XLMPreTrainedModel):
         assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
 
         # embeddings
-        self.position_embeddings = Embedding(config.max_position_embeddings, self.dim)
+        self.position_embeddings = Embedding(config.max_position_embeddings, self.dim, config=config)
         if config.sinusoidal_embeddings:
             create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1:
-            self.lang_embeddings = Embedding(self.n_langs, self.dim)
-        self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+            self.lang_embeddings = Embedding(self.n_langs, self.dim, config=config)
+        self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index, config=config)
         self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)
 
         # transformer layers
@@ -478,29 +484,31 @@ class XLMModel(XLMPreTrainedModel):
         self.layer_norm1 = nn.ModuleList()
         self.ffns = nn.ModuleList()
         self.layer_norm2 = nn.ModuleList()
-        if self.is_decoder:
-            self.layer_norm15 = nn.ModuleList()
-            self.encoder_attn = nn.ModuleList()
+        # if self.is_decoder:
+        #     self.layer_norm15 = nn.ModuleList()
+        #     self.encoder_attn = nn.ModuleList()
 
         for _ in range(self.n_layers):
-            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
             self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
-            if self.is_decoder:
-                self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
-                self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
-            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout, gelu_activation=config.gelu_activation))
+            # if self.is_decoder:
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
+            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
             self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
 
-    def forward(self, input_ids, lengths, positions=None, langs=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
+    def forward(self, input_ids, lengths=None, positions=None, langs=None,
+                token_type_ids=None, attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
         """
         Inputs:
             `input_ids` LongTensor(bs, slen), containing word indices
             `lengths` LongTensor(bs), containing the length of each sentence
-            `causal` Boolean, if True, the attention is only done over previous hidden states
             `positions` LongTensor(bs, slen), containing word positions
             `langs` LongTensor(bs, slen), containing language IDs
+            `token_type_ids` LongTensor (bs, slen) same as `langs` used for compatibility
         """
-        # lengths = (input_ids != self.pad_index).float().sum(dim=1)
+        if lengths is None:
+            lengths = (input_ids != self.pad_index).float().sum(dim=1)
         # mask = input_ids != self.pad_index
 
         # check inputs
@@ -514,7 +522,7 @@ class XLMModel(XLMPreTrainedModel):
         #     assert src_enc.size(0) == bs
 
         # generate masks
-        mask, attn_mask = get_masks(slen, lengths, self.causal)
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
         # if self.is_decoder and src_enc is not None:
         #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
 
@@ -527,10 +535,28 @@ class XLMModel(XLMPreTrainedModel):
             # positions = positions.transpose(0, 1)
 
         # langs
+        assert langs is None or token_type_ids is None, "You can only use one among langs and token_type_ids"
+        if token_type_ids is not None:
+            langs = token_type_ids
         if langs is not None:
             assert langs.size() == (bs, slen)  # (slen, bs)
             # langs = langs.transpose(0, 1)
 
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.n_layers
+
         # do not recompute cached elements
         if cache is not None:
             _slen = slen - cache['slen']
@@ -696,9 +722,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     ```
     """
     def __init__(self, config):
-        super(XLMLMHeadModel, self).__init__(config)
-        self.attn_type = config.attn_type
-        self.same_length = config.same_length
+        super(XLMWithLMHeadModel, self).__init__(config)
 
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)
@@ -711,8 +735,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self.pred_layer.proj.weight = self.transformer.embeddings.weight
 
-    def forward(self, input_ids, lengths, positions=None, langs=None, cache=None,
-                labels=None, head_mask=None):
+    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -739,7 +763,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
         """
-        transformer_outputs = self.transformer(input_ids, lengths, positions=positions, langs=langs, cache=cache, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         logits = self.pred_layer(output, labels)
@@ -759,14 +784,14 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
 
 class XLMSequenceSummary(nn.Module):
-    def __init__(self, config, summary_type="last", use_proj=True):
+    def __init__(self, config):
         super(XLMSequenceSummary, self).__init__()
-        self.summary_type = summary_type
-        if use_proj:
+        self.summary_type = config.summary_type
+        if config.use_proj:
             self.summary = nn.Linear(config.d_model, config.d_model)
         else:
             self.summary = None
-        if summary_type == 'attn':
+        if config.summary_type == 'attn':
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
@@ -859,14 +884,13 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         super(XLMForSequenceClassification, self).__init__(config)
 
         self.transformer = XLMModel(config)
-
         self.sequence_summary = XLMSequenceSummary(config)
-        self.logits_proj = nn.Linear(config.d_model, num_labels)
+        self.logits_proj = nn.Linear(config.d_model, config.num_labels)
+
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                labels=None, head_mask=None):
+    def forward(self, input_ids, lengths=None, positions=None, langs=None, attention_mask=None,
+                cache=None, labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -894,8 +918,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         output = self.sequence_summary(output)
@@ -974,7 +998,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
     start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, CONFIG_NAME):
+    def __init__(self, config):
         super(XLMForQuestionAnswering, self).__init__(config)
 
         self.transformer = XLMModel(config)
@@ -982,12 +1006,11 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                start_positions=None, end_positions=None, head_mask=None):
+    def forward(self, input_ids, lengths=None, positions=None, langs=None, attention_mask=None, cache=None,
+                labels=None, head_mask=None):
 
-        transformer_outputs = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         logits = self.qa_outputs(output)
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index da5d0f8b8a..1179b75368 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -36,7 +36,9 @@ def _create_and_check_initialization(tester, model_classes, config, inputs_dict)
     for model_class in model_classes:
         model = model_class(config=configs_no_init)
         for name, param in model.named_parameters():
-            tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0], msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+            if param.requires_grad:
+                tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
+                                       msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
 
 def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
     configs_no_init = _config_zero_init(config)
diff --git a/pytorch_pretrained_bert/tests/model_utils_test.py b/pytorch_pretrained_bert/tests/model_utils_test.py
index 76585453c8..59f076fa00 100644
--- a/pytorch_pretrained_bert/tests/model_utils_test.py
+++ b/pytorch_pretrained_bert/tests/model_utils_test.py
@@ -26,7 +26,7 @@ import pytest
 import torch
 
 from pytorch_pretrained_bert import PretrainedConfig, PreTrainedModel
-from pytorch_pretrained_bert.modeling import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_pretrained_bert.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_pretrained_bert/tests/modeling_test.py b/pytorch_pretrained_bert/tests/modeling_bert_test.py
similarity index 99%
rename from pytorch_pretrained_bert/tests/modeling_test.py
rename to pytorch_pretrained_bert/tests/modeling_bert_test.py
index 2219ee7589..be5c3e090d 100644
--- a/pytorch_pretrained_bert/tests/modeling_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_bert_test.py
@@ -16,20 +16,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import unittest
-import json
-import random
 import shutil
 import pytest
 
-import torch
-
 from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_pretrained_bert.modeling import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_pretrained_bert.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
new file mode 100644
index 0000000000..d2cf8235d4
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
@@ -0,0 +1,276 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+
+from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMForQuestionAnswering, XLMForSequenceClassification)
+from pytorch_pretrained_bert.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+
+from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+
+
+class XLMModelTest(unittest.TestCase):
+    class XLMModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_lengths=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     gelu_activation=True,
+                     sinusoidal_embeddings=False,
+                     causal=False,
+                     asm=False,
+                     n_langs=2,
+                     vocab_size=99,
+                     n_special=0,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     summary_type="last",
+                     use_proj=True,
+                     scope=None,
+                     all_model_classes = (XLMModel,),  # , XLMForSequenceClassification, XLMForTokenClassification),
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_lengths = use_input_lengths
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.gelu_activation = gelu_activation
+            self.sinusoidal_embeddings = sinusoidal_embeddings
+            self.asm = asm
+            self.n_langs = n_langs
+            self.vocab_size = vocab_size
+            self.n_special = n_special
+            self.summary_type = summary_type
+            self.causal = causal
+            self.use_proj = use_proj
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.n_langs = n_langs
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.summary_type = summary_type
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+            self.all_model_classes = all_model_classes
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_lengths = None
+            if self.use_input_lengths:
+                input_lengths = ids_tensor([self.batch_size], vocab_size=self.seq_length-1)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = XLMConfig(
+                 vocab_size_or_config_json_file=self.vocab_size,
+                 n_special=self.n_special,
+                 emb_dim=self.hidden_size,
+                 n_layers=self.num_hidden_layers,
+                 n_heads=self.num_attention_heads,
+                 dropout=self.hidden_dropout_prob,
+                 attention_dropout=self.attention_probs_dropout_prob,
+                 gelu_activation=self.gelu_activation,
+                 sinusoidal_embeddings=self.sinusoidal_embeddings,
+                 asm=self.asm,
+                 causal=self.causal,
+                 n_langs=self.n_langs,
+                 max_position_embeddings=self.max_position_embeddings,
+                 initializer_range=self.initializer_range,
+                 summary_type=self.summary_type,
+                 use_proj=self.use_proj)
+
+            return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+            model = XLMModel(config=config)
+            model.eval()
+            outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+            sequence_output = outputs[0]
+            result = {
+                "sequence_output": sequence_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+
+
+        # def create_and_check_xlm_for_masked_lm(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        #     model = XLMForMaskedLM(config=config)
+        #     model.eval()
+        #     loss, prediction_scores = model(input_ids, token_type_ids, input_lengths, token_labels)
+        #     result = {
+        #         "loss": loss,
+        #         "prediction_scores": prediction_scores,
+        #     }
+        #     self.parent.assertListEqual(
+        #         list(result["prediction_scores"].size()),
+        #         [self.batch_size, self.seq_length, self.vocab_size])
+        #     self.check_loss_output(result)
+
+
+        # def create_and_check_xlm_for_question_answering(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        #     model = XLMForQuestionAnswering(config=config)
+        #     model.eval()
+        #     loss, start_logits, end_logits = model(input_ids, token_type_ids, input_lengths, sequence_labels, sequence_labels)
+        #     result = {
+        #         "loss": loss,
+        #         "start_logits": start_logits,
+        #         "end_logits": end_logits,
+        #     }
+        #     self.parent.assertListEqual(
+        #         list(result["start_logits"].size()),
+        #         [self.batch_size, self.seq_length])
+        #     self.parent.assertListEqual(
+        #         list(result["end_logits"].size()),
+        #         [self.batch_size, self.seq_length])
+        #     self.check_loss_output(result)
+
+
+        # def create_and_check_xlm_for_sequence_classification(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        #     config.num_labels = self.num_labels
+        #     model = XLMForSequenceClassification(config)
+        #     model.eval()
+        #     loss, logits = model(input_ids, token_type_ids, input_lengths, sequence_labels)
+        #     result = {
+        #         "loss": loss,
+        #         "logits": logits,
+        #     }
+        #     self.parent.assertListEqual(
+        #         list(result["logits"].size()),
+        #         [self.batch_size, self.num_labels])
+        #     self.check_loss_output(result)
+
+
+        # def create_and_check_xlm_for_token_classification(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        #     config.num_labels = self.num_labels
+        #     model = XLMForTokenClassification(config=config)
+        #     model.eval()
+        #     loss, logits = model(input_ids, token_type_ids, input_lengths, token_labels)
+        #     result = {
+        #         "loss": loss,
+        #         "logits": logits,
+        #     }
+        #     self.parent.assertListEqual(
+        #         list(result["logits"].size()),
+        #         [self.batch_size, self.seq_length, self.num_labels])
+        #     self.check_loss_output(result)
+
+
+        # def create_and_check_xlm_for_multiple_choice(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        #     config.num_choices = self.num_choices
+        #     model = XLMForMultipleChoice(config=config)
+        #     model.eval()
+        #     multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        #     multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        #     multiple_choice_input_lengths = input_lengths.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        #     loss, logits = model(multiple_choice_inputs_ids,
+        #                  multiple_choice_token_type_ids,
+        #                  multiple_choice_input_lengths,
+        #                  choice_labels)
+        #     result = {
+        #         "loss": loss,
+        #         "logits": logits,
+        #     }
+        #     self.parent.assertListEqual(
+        #         list(result["logits"].size()),
+        #         [self.batch_size, self.num_choices])
+        #     self.check_loss_output(result)
+
+
+        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_lengths}
+            create_and_check_commons(self, config, inputs_dict)
+
+    def test_default(self):
+        self.run_tester(XLMModelTest.XLMModelTester(self))
+
+    def test_config(self):
+        config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
+        config_tester.run_common_tests()
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+    def run_tester(self, tester):
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlm_model(*config_and_inputs)
+
+        # config_and_inputs = tester.prepare_config_and_inputs()
+        # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
+
+        # config_and_inputs = tester.prepare_config_and_inputs()
+        # tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
+        # config_and_inputs = tester.prepare_config_and_inputs()
+        # tester.create_and_check_xlm_for_question_answering(*config_and_inputs)
+
+        # config_and_inputs = tester.prepare_config_and_inputs()
+        # tester.create_and_check_xlm_for_sequence_classification(*config_and_inputs)
+
+        # config_and_inputs = tester.prepare_config_and_inputs()
+        # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
+
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlm_commons(*config_and_inputs)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_pretrained_bert/tests/tokenization_test.py b/pytorch_pretrained_bert/tests/tokenization_bert_test.py
similarity index 98%
rename from pytorch_pretrained_bert/tests/tokenization_test.py
rename to pytorch_pretrained_bert/tests/tokenization_bert_test.py
index 249f71f984..e00771c1b1 100644
--- a/pytorch_pretrained_bert/tests/tokenization_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_bert_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+from pytorch_pretrained_bert.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py b/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
new file mode 100644
index 0000000000..d288f2fe60
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import json
+import shutil
+import pytest
+
+from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+
+
+class XLMTokenizationTest(unittest.TestCase):
+
+    def test_full_tokenizer(self):
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
+                 "w</w>", "r</w>", "t</w>",
+                 "lo", "low", "er</w>",
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
+        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+            vocab_file = fp.name
+        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
+            fp.write("\n".join(merges))
+            merges_file = fp.name
+
+        tokenizer = XLMTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        os.remove(vocab_file)
+        os.remove(merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
+        tokenizer_2 = XLMTokenizer.from_pretrained("/tmp/")
+        os.remove(vocab_file)
+        os.remove(merges_file)
+        os.remove(special_tokens_file)
+
+        self.assertListEqual(
+            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
+             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
+            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
+             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
+
+    @pytest.mark.slow
+    def test_tokenizer_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(tokenizer)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization.py
rename to pytorch_pretrained_bert/tokenization_bert.py
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 52d735efa8..5b2bd31cd0 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -26,7 +26,7 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .tokenization import BasicTokenizer
+from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_pretrained_bert/tokenization_xlm.py
index a4c1a61545..d6705954c0 100644
--- a/pytorch_pretrained_bert/tokenization_xlm.py
+++ b/pytorch_pretrained_bert/tokenization_xlm.py
@@ -26,7 +26,7 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .tokenization import BasicTokenizer
+from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 

From 8fa3a1f0d8421c6c35b11a88d8de5d42b8f999c3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Jul 2019 22:54:53 +0200
Subject: [PATCH 047/139] updating tests

---
 pytorch_pretrained_bert/modeling_xlm.py       | 54 +++++++++++--------
 .../tests/model_tests_commons.py              |  6 +++
 .../tests/modeling_bert_test.py               |  2 +
 .../tests/modeling_xlm_test.py                |  6 ++-
 .../tests/modeling_xlnet_test.py              | 32 ++++++++---
 pytorch_pretrained_bert/tokenization_xlm.py   |  1 -
 6 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 66a0b0b1ed..c119a43de0 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -35,7 +35,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
 
@@ -46,24 +46,6 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
 }
 
-DECODER_ONLY_PARAMS = [
-    'layer_norm15.%i.weight', 'layer_norm15.%i.bias',
-    'encoder_attn.%i.q_lin.weight', 'encoder_attn.%i.q_lin.bias',
-    'encoder_attn.%i.k_lin.weight', 'encoder_attn.%i.k_lin.bias',
-    'encoder_attn.%i.v_lin.weight', 'encoder_attn.%i.v_lin.bias',
-    'encoder_attn.%i.out_lin.weight', 'encoder_attn.%i.out_lin.bias'
-]
-
-TRANSFORMER_LAYER_PARAMS = [
-    'attentions.%i.q_lin.weight', 'attentions.%i.q_lin.bias',
-    'attentions.%i.k_lin.weight', 'attentions.%i.k_lin.bias',
-    'attentions.%i.v_lin.weight', 'attentions.%i.v_lin.bias',
-    'attentions.%i.out_lin.weight', 'attentions.%i.out_lin.bias',
-    'layer_norm1.%i.weight', 'layer_norm1.%i.bias',
-    'ffns.%i.lin1.weight', 'ffns.%i.lin1.bias',
-    'ffns.%i.lin2.weight', 'ffns.%i.lin2.bias',
-    'layer_norm2.%i.weight', 'layer_norm2.%i.bias'
-]
 
 class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
@@ -275,6 +257,24 @@ class MultiHeadAttention(nn.Module):
         self.v_lin = Linear(dim, dim, config=config)
         self.out_lin = Linear(dim, dim, config=config)
 
+    def prune_heads(self, heads):
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_heads, attention_head_size)
+        for head in heads:
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = attention_head_size * self.n_heads
+
     def forward(self, input, mask, kv=None, cache=None, head_mask=None):
         """
         Self-attention (if kv is None) or attention over source sentence (provided by kv).
@@ -286,9 +286,9 @@ class MultiHeadAttention(nn.Module):
             klen = qlen if cache is None else cache['slen'] + qlen
         else:
             klen = kv.size(1)
-        assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
         n_heads = self.n_heads
-        dim_per_head = dim // n_heads
+        dim_per_head = self.dim // n_heads
         mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
 
         def shape(x):
@@ -335,7 +335,7 @@ class MultiHeadAttention(nn.Module):
 
         outputs = (self.out_lin(context),)
         if self.output_attentions:
-            outputs = outputs + (weights)
+            outputs = outputs + (weights,)
         return outputs
 
 
@@ -497,6 +497,14 @@ class XLMModel(XLMPreTrainedModel):
             self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
             self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
 
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.attentions[layer].prune_heads(heads)
+
     def forward(self, input_ids, lengths=None, positions=None, langs=None,
                 token_type_ids=None, attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
         """
@@ -508,7 +516,7 @@ class XLMModel(XLMPreTrainedModel):
             `token_type_ids` LongTensor (bs, slen) same as `langs` used for compatibility
         """
         if lengths is None:
-            lengths = (input_ids != self.pad_index).float().sum(dim=1)
+            lengths = (input_ids != self.pad_index).sum(dim=1).long()
         # mask = input_ids != self.pad_index
 
         # check inputs
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_pretrained_bert/tests/model_tests_commons.py
index 1179b75368..88fb8d0420 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_pretrained_bert/tests/model_tests_commons.py
@@ -68,6 +68,8 @@ def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict
         attentions = outputs[-1]
         hidden_states = outputs[-2]
 
+        # Remove Nan
+
         tester.parent.assertIsNotNone(multihead_outputs)
         tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
         tester.parent.assertAlmostEqual(
@@ -298,7 +300,11 @@ class GPTModelTester(object):
                             mc_labels, lm_labels, mc_token_ids):
         model = self.base_model_class(config)
         model.eval()
+
         outputs = model(input_ids, position_ids, token_type_ids)
+        outputs = model(input_ids, position_ids)
+        outputs = model(input_ids)
+
         hidden_state = outputs[0]
         self.parent.assertListEqual(
             list(hidden_state.size()),
diff --git a/pytorch_pretrained_bert/tests/modeling_bert_test.py b/pytorch_pretrained_bert/tests/modeling_bert_test.py
index be5c3e090d..7a9d49fde7 100644
--- a/pytorch_pretrained_bert/tests/modeling_bert_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_bert_test.py
@@ -126,6 +126,8 @@ class BertModelTest(unittest.TestCase):
             model = BertModel(config=config)
             model.eval()
             sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
+            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids)
 
             result = {
                 "sequence_output": sequence_output,
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
index d2cf8235d4..17ed978abe 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlm_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
@@ -96,7 +96,7 @@ class XLMModelTest(unittest.TestCase):
 
             input_lengths = None
             if self.use_input_lengths:
-                input_lengths = ids_tensor([self.batch_size], vocab_size=self.seq_length-1)
+                input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2  # small variation of seq_length
 
             token_type_ids = None
             if self.use_token_type_ids:
@@ -139,6 +139,8 @@ class XLMModelTest(unittest.TestCase):
             model = XLMModel(config=config)
             model.eval()
             outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+            outputs = model(input_ids, langs=token_type_ids)
+            outputs = model(input_ids)
             sequence_output = outputs[0]
             result = {
                 "sequence_output": sequence_output,
@@ -232,7 +234,7 @@ class XLMModelTest(unittest.TestCase):
 
 
         def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_lengths}
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
             create_and_check_commons(self, config, inputs_dict)
 
     def test_default(self):
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
index 1527f08642..13030567b9 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
@@ -140,7 +140,26 @@ class XLNetModelTest(unittest.TestCase):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
-        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+            model = XLNetModel(config)
+            model.eval()
+
+            _, _ = model(input_ids_1, token_type_ids=segment_ids)
+            outputs, mems_1 = model(input_ids_1)
+
+            result = {
+                "mems_1": mems_1,
+                "outputs": outputs,
+            }
+
+            self.parent.assertListEqual(
+                list(result["outputs"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1"]),
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+
+        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
             model = XLNetLMHeadModel(config)
             model.eval()
 
@@ -150,7 +169,7 @@ class XLNetModelTest(unittest.TestCase):
 
             logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping, inp_q=inp_q)
 
-            outputs = {
+            result = {
                 "loss_1": loss_1,
                 "mems_1": mems_1,
                 "all_logits_1": all_logits_1,
@@ -158,9 +177,7 @@ class XLNetModelTest(unittest.TestCase):
                 "mems_2": mems_2,
                 "all_logits_2": all_logits_2,
             }
-            return outputs
 
-        def check_transfo_xl_lm_head_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss_1"].size()),
                 [])
@@ -203,8 +220,11 @@ class XLNetModelTest(unittest.TestCase):
     def run_tester(self, tester):
         tester.set_seed()
         config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
-        tester.check_transfo_xl_lm_head_output(output_result)
+        tester.create_and_check_xlnet_base_model(*config_and_inputs)
+
+        tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlnet_lm_head(*config_and_inputs)
 
         tester.set_seed()
         config_and_inputs = tester.prepare_config_and_inputs()
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_pretrained_bert/tokenization_xlm.py
index d6705954c0..25a0c1b542 100644
--- a/pytorch_pretrained_bert/tokenization_xlm.py
+++ b/pytorch_pretrained_bert/tokenization_xlm.py
@@ -304,7 +304,6 @@ class XLMTokenizer(object):
 
         index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
             for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."

From c22545aa407c6a51c18a811677411f18fa4a32d5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Jul 2019 23:03:57 +0200
Subject: [PATCH 048/139] fix xlm torchscript

---
 pytorch_pretrained_bert/modeling_xlm.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index c119a43de0..84ba88ff7e 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -536,7 +536,7 @@ class XLMModel(XLMPreTrainedModel):
 
         # positions
         if positions is None:
-            positions = input_ids.new(slen).long()
+            positions = input_ids.new((slen,)).long()
             positions = torch.arange(slen, out=positions).unsqueeze(0)
         else:
             assert positions.size() == (bs, slen)  # (slen, bs)
@@ -585,17 +585,17 @@ class XLMModel(XLMPreTrainedModel):
         tensor *= mask.unsqueeze(-1).to(tensor.dtype)
 
         # transformer layers
-        hidden_states = []
-        attentions = []
+        hidden_states = ()
+        attentions = ()
         for i in range(self.n_layers):
             if self.output_hidden_states:
-                hidden_states.append(tensor)
+                hidden_states = hidden_states + (tensor,)
 
             # self attention
             attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
             attn = attn_outputs[0]
             if self.output_attentions:
-                attentions.append(attn_outputs[1])
+                attentions = attentions + (attn_outputs[1],)
             attn = F.dropout(attn, p=self.dropout, training=self.training)
             tensor = tensor + attn
             tensor = self.layer_norm1[i](tensor)
@@ -614,7 +614,7 @@ class XLMModel(XLMPreTrainedModel):
 
         # Add last hidden state
         if self.output_hidden_states:
-            hidden_states.append(tensor)
+            hidden_states = hidden_states + (tensor,)
 
         # update cache length
         if cache is not None:
@@ -623,11 +623,11 @@ class XLMModel(XLMPreTrainedModel):
         # move back sequence length to dimension 0
         # tensor = tensor.transpose(0, 1)
 
-        outputs = [tensor]
+        outputs = (tensor,)
         if self.output_hidden_states:
-            outputs.append(hidden_states)
+            outputs = outputs + (hidden_states,)
         if self.output_attentions:
-            outputs.append(attentions)
+            outputs = outputs + (attentions,)
         return outputs  # outputs, (hidden_states), (attentions)
 
 

From fbe04423b6fc5ca2b7d28e423264e50505dbdf45 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 4 Jul 2019 00:25:30 +0200
Subject: [PATCH 049/139] Common SequenceSummary class

---
 pytorch_pretrained_bert/__init__.py        |   2 +-
 pytorch_pretrained_bert/model_utils.py     | 108 +++++++++++++++++----
 pytorch_pretrained_bert/modeling_gpt2.py   |  48 +++------
 pytorch_pretrained_bert/modeling_openai.py |  50 +++-------
 pytorch_pretrained_bert/modeling_xlnet.py  |  47 ++-------
 5 files changed, 130 insertions(+), 125 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index e14b8b27a9..23346967ba 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -17,7 +17,7 @@ from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
 from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl)
 from .modeling_gpt2 import (GPT2Config, GPT2Model,
-                            GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,
+                            GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2)
 from .modeling_xlnet import (XLNetConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index ec735c3e0a..0496e41bbd 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -282,6 +282,95 @@ class PreTrainedModel(nn.Module):
         return model
 
 
+class Conv1D(nn.Module):
+    def __init__(self, nf, nx):
+        """ Conv1D layer as defined by Alec for GPT (and also used in GPT-2)
+            Basically works like a Linear layer but the weights are transposed
+        """
+        super(Conv1D, self).__init__()
+        self.nf = nf
+        w = torch.empty(nx, nf)
+        nn.init.normal_(w, std=0.02)
+        self.weight = nn.Parameter(w)
+        self.bias = nn.Parameter(torch.zeros(nf))
+
+    def forward(self, x):
+        size_out = x.size()[:-1] + (self.nf,)
+        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
+        x = x.view(*size_out)
+        return x
+
+
+class SequenceSummary(nn.Module):
+    def __init__(self, config):
+        """ Compute a single vector summary of a sequence hidden states according to various possibilities:
+            Args of the config class:
+                summary_type:
+                    - 'last' => [default] take the last token hidden state (like XLNet)
+                    - 'first' => take the first token hidden state (like Bert)
+                    - 'mean' => take the mean of all tokens hidden states
+                    - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
+                    - 'attn' => Not implemented now, use multi-head attention
+                summary_use_proj: Add a projection after the vector extraction
+                summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
+                summary_activation:
+                    'tanh' => add a tanh activation to the output
+                     None => no activation
+        """
+        super(SequenceSummary, self).__init__()
+
+        self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
+        if config.summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = nn.Identity()
+        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+            if hasattr(config, 'summary_num_classes') and config.summary_num_classes > 0:
+                num_classes = config.summary_num_classes
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        self.activation = nn.Identity()
+        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+            self.activation = nn.Tanh()
+
+        self.dropout = nn.Dropout(config.summary_dropout)
+
+    def forward(self, hidden_states, token_ids=None):
+        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
+            token_ids: [optional] index of the classification token if summary_type == 'token_ids',
+                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
+                if summary_type == 'token_ids' and token_ids is None:
+                    we take the last token of the sequence as classification token
+        """
+        if self.summary_type == 'last':
+            output = hidden_states[:, -1]
+        elif self.summary_type == 'first':
+            output = hidden_states[:, 0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == 'token_ids':
+            if token_ids is None:
+                token_ids = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
+            else:
+                token_ids = token_ids.unsqueeze(-1).unsqueeze(-1)
+                token_ids = token_ids.expand((-1,) * (token_ids.dim()-1) + (hidden_states.size(-1),))
+            # shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, token_ids).squeeze(-2) # shape (bsz, XX, hidden_size)
+        elif self.summary_type == 'attn':
+            raise NotImplementedError
+
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.dropout(output)
+
+        return output
+
+
 def prune_linear_layer(layer, index, dim=0):
     """ Prune a linear layer (a model parameters) to keep only entries in index.
         Return the pruned layer as a new layer with requires_grad=True.
@@ -307,25 +396,6 @@ def prune_linear_layer(layer, index, dim=0):
     return new_layer
 
 
-class Conv1D(nn.Module):
-    """ Conv1D layer as defined by Alec Radford for GPT (and also used in GPT-2)
-        Basically works like a Linear layer but the weights are transposed
-    """
-    def __init__(self, nf, nx):
-        super(Conv1D, self).__init__()
-        self.nf = nf
-        w = torch.empty(nx, nf)
-        nn.init.normal_(w, std=0.02)
-        self.weight = nn.Parameter(w)
-        self.bias = nn.Parameter(torch.zeros(nf))
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(*size_out)
-        return x
-
-
 def prune_conv1d_layer(layer, index, dim=1):
     """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
         A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index fa5766f4e0..c16ad2f763 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
+from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+                          PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -119,6 +120,11 @@ class GPT2Config(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+        summary_type='token_ids',
+        summary_use_proj=True,
+        summary_num_classes=1,
+        summary_activation=None,
+        summary_dropout=0.1,
         **kwargs
     ):
         """Constructs GPT2Config.
@@ -164,6 +170,11 @@ class GPT2Config(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_num_classes = summary_num_classes
+            self.summary_activation = summary_activation
+            self.summary_dropout = summary_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
@@ -342,37 +353,6 @@ class GPT2LMHead(nn.Module):
         return lm_logits
 
 
-class GPT2MultipleChoiceHead(nn.Module):
-    """ Classifier Head for the transformer """
-
-    def __init__(self, config):
-        super(GPT2MultipleChoiceHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(config.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std=0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, hidden_states, mc_token_ids=None):
-        """ Extract classification token hidden state and project it using self.linear
-            hidden_state: shape (bsz, num_choices, seq_length, hidden_size)
-            mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
-            if mc_token_ids=None we take the last token of the sequence as classification token
-        """
-        if mc_token_ids is None:
-            mc_token_ids = torch.full_like(hidden_states[:, :, :1, :], hidden_states.shape[2] - 1, dtype=torch.long)
-        else:
-            mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
-        # mc_token_ids has shape (bsz, num_choices, 1, hidden_size)
-        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
-        # (bsz, num_choices, hidden_size)
-        multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
-        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
-        # (bsz, num_choices)
-        return multiple_choice_logits
-
-
 class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -735,7 +715,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
         self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
-        self.multiple_choice_head = GPT2MultipleChoiceHead(config)
+        self.multiple_choice_head = SequenceSummary(config)
 
         self.apply(self.init_weights)
 
@@ -753,7 +733,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
 
         outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 6a182526e8..1a3e7fbbb4 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_conv1d_layer
+from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+                          PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -147,6 +148,11 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+        summary_type='token_ids',
+        summary_use_proj=True,
+        summary_num_classes=1,
+        summary_activation=None,
+        summary_dropout=0.1,
         **kwargs
     ):
         """Constructs OpenAIGPTConfig.
@@ -195,6 +201,11 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_num_classes = summary_num_classes
+            self.summary_activation = summary_activation
+            self.summary_dropout = summary_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
@@ -368,37 +379,6 @@ class OpenAIGPTLMHead(nn.Module):
         return lm_logits
 
 
-class OpenAIGPTMultipleChoiceHead(nn.Module):
-    """ Classifier Head for the transformer """
-
-    def __init__(self, config):
-        super(OpenAIGPTMultipleChoiceHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(config.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std=0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, hidden_states, mc_token_ids=None):
-        """ Extract classification token hidden state and project it using self.linear
-            hidden_state: hidden state of shape (bsz, num_choices, seq_length, hidden_size)
-            mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
-            if mc_token_ids=None we take the last token of the sequence as classification token
-        """
-        if mc_token_ids is None:
-            mc_token_ids = torch.full_like(hidden_states[:, :, :1, :], hidden_states.shape[2] - 1, dtype=torch.long)
-        else:
-            mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
-        # (bsz, num_choices, 1, hidden_size)
-        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
-        # (bsz, num_choices, hidden_size)
-        multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
-        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
-        # (bsz, num_choices)
-        return multiple_choice_logits
-
-
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -768,9 +748,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
-        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
+        self.multiple_choice_head = SequenceSummary(config)
+
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
@@ -787,7 +769,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
 
         outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
         if mc_labels is not None:
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 2771ba7ca5..fb3d72954d 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -32,7 +32,8 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .model_utils import (CONFIG_NAME, WEIGHTS_NAME,
+                          PretrainedConfig, PreTrainedModel, SequenceSummary)
 
 
 logger = logging.getLogger(__name__)
@@ -223,8 +224,10 @@ class XLNetConfig(PretrainedConfig):
                  
                  finetuning_task=None,
                  num_labels=2,
-                 summary_type="last",
-                 use_proj=True,
+                 summary_type='last',
+                 summary_use_proj=True,
+                 summary_activation='tanh',
+                 summary_dropout=0.1,
                  **kwargs):
         """Constructs XLNetConfig.
 
@@ -307,7 +310,9 @@ class XLNetConfig(PretrainedConfig):
             self.finetuning_task = finetuning_task
             self.num_labels = num_labels
             self.summary_type = summary_type
-            self.use_proj = use_proj
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_dropout = summary_dropout
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -1042,38 +1047,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
 
-class XLNetSequenceSummary(nn.Module):
-    def __init__(self, config):
-        super(XLNetSequenceSummary, self).__init__()
-        self.summary_type = config.summary_type
-        if config.use_proj:
-            self.summary = nn.Linear(config.d_model, config.d_model)
-        else:
-            self.summary = None
-        if config.summary_type == 'attn':
-            # We should use a standard multi-head attention module with absolute positional embedding for that.
-            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
-            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
-            raise NotImplementedError
-        self.dropout = nn.Dropout(config.dropout)
-        self.activation = nn.Tanh()
-
-    def forward(self, hidden_states):
-        """ hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer."""
-        if self.summary_type == 'last':
-            output = hidden_states[:, -1]
-        elif self.summary_type == 'first':
-            output = hidden_states[:, 0]
-        elif self.summary_type == 'mean':
-            output = hidden_states.mean(dim=1)
-        elif self.summary_type == 'attn':
-            raise NotImplementedError
-
-        output = self.summary(output)
-        output = self.activation(output)
-        output = self.dropout(output)
-        return output
-
 
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
@@ -1143,7 +1116,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         super(XLNetForSequenceClassification, self).__init__(config)
 
         self.transformer = XLNetModel(config)
-        self.sequence_summary = XLNetSequenceSummary(config)
+        self.sequence_summary = SequenceSummary(config)
         self.logits_proj = nn.Linear(config.d_model, config.num_labels)
 
         self.apply(self.init_weights)

From 15b70338ba29ed17e69d38eb68f6940f8efa4a0b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 4 Jul 2019 16:50:42 +0200
Subject: [PATCH 050/139] adding squad model to xlnet and xlm

---
 pytorch_pretrained_bert/model_utils.py        | 197 +++++++++++++--
 pytorch_pretrained_bert/modeling_xlm.py       | 231 +++++++-----------
 pytorch_pretrained_bert/modeling_xlnet.py     |  94 +++++--
 .../tests/modeling_openai_test.py             |   4 -
 .../tests/modeling_xlm_test.py                | 172 ++++++-------
 .../tests/modeling_xlnet_test.py              | 145 +++++++----
 6 files changed, 524 insertions(+), 319 deletions(-)

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index 0496e41bbd..b72707ce08 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -25,7 +25,7 @@ from io import open
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss
+from torch.nn import CrossEntropyLoss, MSELoss, functional as F
 
 from .file_utils import cached_path
 
@@ -301,22 +301,189 @@ class Conv1D(nn.Module):
         return x
 
 
-class SequenceSummary(nn.Module):
+class PoolerStartLogits(nn.Module):
+    """ Compute SQuAD start_logits from sequence hidden states. """
     def __init__(self, config):
-        """ Compute a single vector summary of a sequence hidden states according to various possibilities:
-            Args of the config class:
-                summary_type:
-                    - 'last' => [default] take the last token hidden state (like XLNet)
-                    - 'first' => take the first token hidden state (like Bert)
-                    - 'mean' => take the mean of all tokens hidden states
-                    - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
-                    - 'attn' => Not implemented now, use multi-head attention
-                summary_use_proj: Add a projection after the vector extraction
-                summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
-                summary_activation:
-                    'tanh' => add a tanh activation to the output
-                     None => no activation
+        super(PoolerStartLogits, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, 1)
+
+    def forward(self, hidden_states, p_mask=None):
+        """ Args:
+            `p_mask`: [optional] invalid position mask such as query and special symbols (PAD, SEP, CLS)
+                shape [batch_size, seq_len]. 1.0 means token should be masked.
         """
+        x = self.dense(hidden_states).squeeze(-1)
+
+        if p_mask is not None:
+            x = x * (1 - p_mask) - 1e30 * p_mask
+
+        return x
+
+
+class PoolerEndLogits(nn.Module):
+    """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
+    """
+    def __init__(self, config):
+        super(PoolerEndLogits, self).__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dense_1 = nn.Linear(config.hidden_size, 1)
+
+    def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
+        """ Args:
+            One of start_states, start_positions should be not None. If both are set, start_positions overrides start_states.
+            `start_states`: hidden states of the first tokens for the labeled span: torch.LongTensor of shape identical to hidden_states.
+            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            `p_mask`: [optional] invalid position mask such as query and special symbols (PAD, SEP, CLS)
+                shape [batch_size, seq_len]. 1.0 means token should be masked.
+        """
+        slen, hsz = hidden_states.shape[-2:]
+        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+        if start_positions is not None:
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
+            start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
+
+        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
+        x = self.activation(x)
+        x = self.LayerNorm(x)
+        x = self.dense_1(x).squeeze(-1)
+
+        if p_mask is not None:
+            x = x * (1 - p_mask) - 1e30 * p_mask
+
+        return x
+
+
+class PoolerAnswerClass(nn.Module):
+    """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
+    def __init__(self, config):
+        super(PoolerAnswerClass, self).__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
+
+    def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
+        """ Args:
+            One of start_states, start_positions should be not None. If both are set, start_positions overrides start_states.
+            `start_states`: hidden states of the first tokens for the labeled span: torch.LongTensor of shape identical to hidden_states.
+            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            `cls_index`: position of the CLS token: torch.LongTensor of shape [batch_size]. If None, take the last token.
+
+            # note(zhiliny): no dependency on end_feature so that we can obtain one single `cls_logits` for each sample
+        """
+        slen, hsz = hidden_states.shape[-2:]
+        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+        if start_positions is not None:
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
+
+        if cls_index is not None:
+            cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
+        else:
+            cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
+
+        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
+        x = self.activation(x)
+        x = self.dense_1(x).squeeze(-1)
+
+        return x
+
+
+class SQuADHead(nn.Module):
+    """ A SQuAD head inspired by XLNet.
+        Compute
+    """
+    def __init__(self, config):
+        super(SQuADHead, self).__init__()
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
+
+    def forward(self, hidden_states, start_positions=None, end_positions=None,
+                cls_index=None, is_impossible=None, p_mask=None):
+        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
+        """
+        outputs = ()
+
+        start_logits = self.start_logits(hidden_states, p_mask)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                cls_loss = loss_fct_cls(cls_logits, is_impossible)
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+                outputs = (total_loss, start_logits, end_logits, cls_logits) + outputs
+            else:
+                outputs = (total_loss, start_logits, end_logits) + outputs
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
+            start_top_index = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index) # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
+
+        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
+        # or (if labels are provided) total_loss, start_logits, end_logits, (cls_logits)
+        return outputs
+
+
+class SequenceSummary(nn.Module):
+    """ Compute a single vector summary of a sequence hidden states according to various possibilities:
+        Args of the config class:
+            summary_type:
+                - 'last' => [default] take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj: Add a projection after the vector extraction
+            summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
+            summary_activation:
+                'tanh' => add a tanh activation to the output
+                    None => no activation
+    """
+    def __init__(self, config):
         super(SequenceSummary, self).__init__()
 
         self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_pretrained_bert/modeling_xlm.py
index 84ba88ff7e..9d1775161d 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_pretrained_bert/modeling_xlm.py
@@ -35,7 +35,8 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
+from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+                          prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
 
@@ -67,15 +68,23 @@ class XLMConfig(PretrainedConfig):
                  n_langs=1,
                  max_position_embeddings=512,
                  embed_init_std=2048 ** -0.5,
+                 layer_norm_eps=1e-12,
                  init_std=0.02,
-                 summary_type="last",
-                 use_proj=True,
                  bos_index=0,
                  eos_index=1,
                  pad_index=2,
                  unk_index=3,
                  mask_index=5,
                  is_encoder=True,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='last',
+                 summary_use_proj=True,
+                 summary_activation='tanh',
+                 summary_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
                  **kwargs):
         """Constructs XLMConfig.
 
@@ -140,8 +149,7 @@ class XLMConfig(PretrainedConfig):
             self.causal = causal
             self.asm = asm
             self.n_langs = n_langs
-            self.summary_type = summary_type
-            self.use_proj = use_proj
+            self.layer_norm_eps = layer_norm_eps
             self.bos_index = bos_index
             self.eos_index = eos_index
             self.pad_index = pad_index
@@ -151,6 +159,14 @@ class XLMConfig(PretrainedConfig):
             self.max_position_embeddings = max_position_embeddings
             self.embed_init_std = embed_init_std
             self.init_std = init_std
+            self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_dropout = summary_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -172,26 +188,6 @@ class XLMConfig(PretrainedConfig):
         return self.n_layers
 
 
-def Embedding(num_embeddings, embedding_dim, padding_idx=None, config=None):
-    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
-    if config is not None and config.embed_init_std is not None:
-        nn.init.normal_(m.weight, mean=0, std=config.embed_init_std)
-    if padding_idx is not None:
-        nn.init.constant_(m.weight[padding_idx], 0)
-    return m
-
-
-def Linear(in_features, out_features, bias=True, config=None):
-    m = nn.Linear(in_features, out_features, bias)
-    if config is not None and config.init_std is not None:
-        nn.init.normal_(m.weight, mean=0, std=config.init_std)
-        if bias:
-            nn.init.constant_(m.bias, 0.)
-    # nn.init.xavier_uniform_(m.weight)
-    # nn.init.constant_(m.bias, 0.)
-    return m
-
-
 def create_sinusoidal_embeddings(n_pos, dim, out):
     position_enc = np.array([
         [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
@@ -244,7 +240,7 @@ class MultiHeadAttention(nn.Module):
     NEW_ID = itertools.count()
 
     def __init__(self, n_heads, dim, config):
-        super().__init__()
+        super(MultiHeadAttention, self).__init__()
         self.layer_id = next(MultiHeadAttention.NEW_ID)
         self.output_attentions = config.output_attentions
         self.dim = dim
@@ -252,10 +248,10 @@ class MultiHeadAttention(nn.Module):
         self.dropout = config.attention_dropout
         assert self.dim % self.n_heads == 0
 
-        self.q_lin = Linear(dim, dim, config=config)
-        self.k_lin = Linear(dim, dim, config=config)
-        self.v_lin = Linear(dim, dim, config=config)
-        self.out_lin = Linear(dim, dim, config=config)
+        self.q_lin = nn.Linear(dim, dim)
+        self.k_lin = nn.Linear(dim, dim)
+        self.v_lin = nn.Linear(dim, dim)
+        self.out_lin = nn.Linear(dim, dim)
 
     def prune_heads(self, heads):
         attention_head_size = self.dim // self.n_heads
@@ -342,10 +338,10 @@ class MultiHeadAttention(nn.Module):
 class TransformerFFN(nn.Module):
 
     def __init__(self, in_dim, dim_hidden, out_dim, config):
-        super().__init__()
+        super(TransformerFFN, self).__init__()
         self.dropout = config.dropout
-        self.lin1 = Linear(in_dim, dim_hidden, config=config)
-        self.lin2 = Linear(dim_hidden, out_dim, config=config)
+        self.lin1 = nn.Linear(in_dim, dim_hidden)
+        self.lin2 = nn.Linear(dim_hidden, out_dim)
         self.act = gelu if config.gelu_activation else F.relu
 
     def forward(self, input):
@@ -363,17 +359,21 @@ class XLMPreTrainedModel(PreTrainedModel):
     config_class = XLMConfig
     pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
-    base_model_prefix = "xlm"
+    base_model_prefix = "transformer"
 
     def __init__(self, *inputs, **kwargs):
         super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
 
     def init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Weights are initialized in module instantiation (see above)
-            pass
+        """ Initialize the weights. """
+        if isinstance(module, nn.Embedding):
+            if self.config is not None and self.config.embed_init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
+        if isinstance(module, nn.Linear):
+            if self.config is not None and self.config.init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
+                if hasattr(module, 'bias') and module.bias is not None:
+                    nn.init.constant_(module.bias, 0.)
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
@@ -471,13 +471,13 @@ class XLMModel(XLMPreTrainedModel):
         assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
 
         # embeddings
-        self.position_embeddings = Embedding(config.max_position_embeddings, self.dim, config=config)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
         if config.sinusoidal_embeddings:
             create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1:
-            self.lang_embeddings = Embedding(self.n_langs, self.dim, config=config)
-        self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index, config=config)
-        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)
+            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
+        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
 
         # transformer layers
         self.attentions = nn.ModuleList()
@@ -490,12 +490,14 @@ class XLMModel(XLMPreTrainedModel):
 
         for _ in range(self.n_layers):
             self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
-            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
+            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
             # if self.is_decoder:
-            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
             #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
             self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
-            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))
+            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+
+        self.apply(self.init_weights)
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -636,14 +638,14 @@ class XLMPredLayer(nn.Module):
     Prediction layer (cross_entropy or adaptive_softmax).
     """
     def __init__(self, config):
-        super().__init__()
+        super(XLMPredLayer, self).__init__()
         self.asm = config.asm
         self.n_words = config.n_words
         self.pad_index = config.pad_index
         dim = config.emb_dim
 
         if config.asm is False:
-            self.proj = Linear(dim, config.n_words, bias=True)
+            self.proj = nn.Linear(dim, config.n_words, bias=True)
         else:
             self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                 in_features=dim,
@@ -653,28 +655,24 @@ class XLMPredLayer(nn.Module):
                 head_bias=True,  # default is False
             )
 
-    def forward(self, x, y, get_scores=False):
+    def forward(self, x, y=None):
+        """ Compute the loss, and optionally the scores.
         """
-        Compute the loss, and optionally the scores.
-        """
-        assert (y == self.pad_index).sum().item() == 0
-
+        outputs = ()
         if self.asm is False:
             scores = self.proj(x).view(-1, self.n_words)
-            loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
+            outputs = (scores,) + outputs
+            if y is not None:
+                loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
+                outputs = (loss,) + outputs
         else:
-            _, loss = self.proj(x, y)
-            scores = self.proj.log_prob(x) if get_scores else None
-
-        return scores, loss
-
-    def get_scores(self, x):
-        """
-        Compute scores.
-        """
-        assert x.dim() == 2
-        return self.proj.log_prob(x) if self.asm else self.proj(x)
+            scores = self.proj.log_prob(x)
+            outputs = (scores,) + outputs
+            if y is not None:
+                _, loss = self.proj(x, y)
+                outputs = (loss,) + outputs
 
+        return outputs
 
 
 class XLMWithLMHeadModel(XLMPreTrainedModel):
@@ -731,6 +729,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
+        self.torchscript = config.torchscript
 
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)
@@ -741,7 +740,10 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        self.pred_layer.proj.weight = self.transformer.embeddings.weight
+        if self.torchscript:
+            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
+        else:
+            self.pred_layer.proj.weight = self.transformer.embeddings.weight
 
     def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
@@ -775,55 +777,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
-        logits = self.pred_layer(output, labels)
-
-        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        if labels is not None:
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            labels.view(-1))
-            outputs = [loss] + outputs
-
-        outputs = [logits] + outputs
+        outputs = self.pred_layer(output, labels)
+        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
 
         return outputs
 
 
-class XLMSequenceSummary(nn.Module):
-    def __init__(self, config):
-        super(XLMSequenceSummary, self).__init__()
-        self.summary_type = config.summary_type
-        if config.use_proj:
-            self.summary = nn.Linear(config.d_model, config.d_model)
-        else:
-            self.summary = None
-        if config.summary_type == 'attn':
-            # We should use a standard multi-head attention module with absolute positional embedding for that.
-            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
-            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
-            raise NotImplementedError
-        self.dropout = nn.Dropout(config.dropout)
-        self.activation = nn.Tanh()
-
-    def forward(self, hidden_states):
-        """ hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer."""
-        if self.summary_type == 'last':
-            output = hidden_states[:, -1]
-        elif self.summary_type == 'first':
-            output = hidden_states[:, 0]
-        elif self.summary_type == 'mean':
-            output = hidden_states.mean(dim=1)
-        elif summary_type == 'attn':
-            raise NotImplementedError
-
-        output = self.summary(output)
-        output = self.activation(output)
-        output = self.dropout(output)
-        return output
-
-
 class XLMForSequenceClassification(XLMPreTrainedModel):
     """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
 
@@ -890,15 +849,15 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
 
         self.transformer = XLMModel(config)
-        self.sequence_summary = XLMSequenceSummary(config)
-        self.logits_proj = nn.Linear(config.d_model, config.num_labels)
+        self.sequence_summary = SequenceSummary(config)
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None, attention_mask=None,
-                cache=None, labels=None, head_mask=None):
+    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, labels=None, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -930,10 +889,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
-        output = self.sequence_summary(output)
-        logits = self.logits_proj(output)
+        logits = self.sequence_summary(output)
 
-        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
 
         if labels is not None:
             if self.num_labels == 1:
@@ -943,9 +901,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = [loss] + outputs
-
-        outputs = [logits] + outputs
+            outputs = (loss,) + outputs
 
         return outputs
 
@@ -1010,41 +966,22 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         super(XLMForQuestionAnswering, self).__init__(config)
 
         self.transformer = XLMModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+        self.qa_outputs = SQuADHead(config)
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None, attention_mask=None, cache=None,
-                labels=None, head_mask=None):
+    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, start_positions=None, end_positions=None,
+                cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
 
         transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
-        logits = self.qa_outputs(output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
 
-        outputs = transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+        outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions,
+                                  cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = [total_loss] + outputs
-
-        outputs = [start_logits, end_logits] + outputs
+        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
 
         return outputs
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index fb3d72954d..f5841e0601 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -32,8 +32,8 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import (CONFIG_NAME, WEIGHTS_NAME,
-                          PretrainedConfig, PreTrainedModel, SequenceSummary)
+from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+                          SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 
 logger = logging.getLogger(__name__)
@@ -221,13 +221,15 @@ class XLNetConfig(PretrainedConfig):
                  bi_data=False,
                  clamp_len=-1,
                  same_length=False,
-                 
+
                  finetuning_task=None,
                  num_labels=2,
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
                  summary_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
                  **kwargs):
         """Constructs XLNetConfig.
 
@@ -313,6 +315,8 @@ class XLNetConfig(PretrainedConfig):
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
             self.summary_dropout = summary_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
@@ -1114,6 +1118,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     """
     def __init__(self, config):
         super(XLNetForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
 
         self.transformer = XLNetModel(config)
         self.sequence_summary = SequenceSummary(config)
@@ -1174,7 +1179,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
 
 class XLNetForQuestionAnswering(XLNetPreTrainedModel):
-    """XLNet model for Question Answering (span extraction).
+    """ XLNet model for Question Answering (span extraction).
     This module is composed of the XLNet model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
 
@@ -1231,41 +1236,78 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     """
     def __init__(self, config):
         super(XLNetForQuestionAnswering, self).__init__(config)
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
 
         self.transformer = XLNetModel(config)
-        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
 
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                start_positions=None, end_positions=None, head_mask=None):
+                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
+                head_mask=None):
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q, head_mask)
+                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+        hidden_states = transformer_outputs[0]
+        start_logits = self.start_logits(hidden_states, p_mask)
 
-        logits = self.qa_outputs(transformer_outputs[0])
-
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+        outputs = transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 
         if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
 
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
 
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                cls_loss = loss_fct_cls(cls_logits, is_impossible)
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is
+                # comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+                outputs = (total_loss, start_logits, end_logits, cls_logits) + outputs
+            else:
+                outputs = (total_loss, start_logits, end_logits) + outputs
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
+            start_top_index = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index) # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
+
+        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems, (hidden states), (attentions)
+        # or (if labels are provided) total_loss, start_logits, end_logits, (cls_logits), mems, (hidden states), (attentions)
+        return outputs
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_pretrained_bert/tests/modeling_openai_test.py
index e3e9e2849d..627bc564de 100644
--- a/pytorch_pretrained_bert/tests/modeling_openai_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_openai_test.py
@@ -16,11 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import unittest
-import json
-import random
-import shutil
 import pytest
 
 import torch
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
index 17ed978abe..3e442a09fb 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlm_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlm_test.py
@@ -20,7 +20,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMForQuestionAnswering, XLMForSequenceClassification)
+from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_pretrained_bert.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
@@ -58,7 +58,8 @@ class XLMModelTest(unittest.TestCase):
                      summary_type="last",
                      use_proj=True,
                      scope=None,
-                     all_model_classes = (XLMModel,),  # , XLMForSequenceClassification, XLMForTokenClassification),
+                     all_model_classes = (XLMModel, XLMWithLMHeadModel,
+                                          XLMForQuestionAnswering, XLMForSequenceClassification),  # , XLMForSequenceClassification, XLMForTokenClassification),
                     ):
             self.parent = parent
             self.batch_size = batch_size
@@ -93,6 +94,7 @@ class XLMModelTest(unittest.TestCase):
 
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
 
             input_lengths = None
             if self.use_input_lengths:
@@ -104,11 +106,11 @@ class XLMModelTest(unittest.TestCase):
 
             sequence_labels = None
             token_labels = None
-            choice_labels = None
+            is_impossible_labels = None
             if self.use_labels:
                 sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                 token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
 
             config = XLMConfig(
                  vocab_size_or_config_json_file=self.vocab_size,
@@ -128,14 +130,14 @@ class XLMModelTest(unittest.TestCase):
                  summary_type=self.summary_type,
                  use_proj=self.use_proj)
 
-            return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels
+            return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask
 
         def check_loss_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss"].size()),
                 [])
 
-        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMModel(config=config)
             model.eval()
             outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
@@ -150,90 +152,92 @@ class XLMModelTest(unittest.TestCase):
                 [self.batch_size, self.seq_length, self.hidden_size])
 
 
-        # def create_and_check_xlm_for_masked_lm(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-        #     model = XLMForMaskedLM(config=config)
-        #     model.eval()
-        #     loss, prediction_scores = model(input_ids, token_type_ids, input_lengths, token_labels)
-        #     result = {
-        #         "loss": loss,
-        #         "prediction_scores": prediction_scores,
-        #     }
-        #     self.parent.assertListEqual(
-        #         list(result["prediction_scores"].size()),
-        #         [self.batch_size, self.seq_length, self.vocab_size])
-        #     self.check_loss_output(result)
+        def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+            model = XLMWithLMHeadModel(config)
+            model.eval()
+
+            loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
+
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
 
 
-        # def create_and_check_xlm_for_question_answering(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-        #     model = XLMForQuestionAnswering(config=config)
-        #     model.eval()
-        #     loss, start_logits, end_logits = model(input_ids, token_type_ids, input_lengths, sequence_labels, sequence_labels)
-        #     result = {
-        #         "loss": loss,
-        #         "start_logits": start_logits,
-        #         "end_logits": end_logits,
-        #     }
-        #     self.parent.assertListEqual(
-        #         list(result["start_logits"].size()),
-        #         [self.batch_size, self.seq_length])
-        #     self.parent.assertListEqual(
-        #         list(result["end_logits"].size()),
-        #         [self.batch_size, self.seq_length])
-        #     self.check_loss_output(result)
+        def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+            model = XLMForQuestionAnswering(config)
+            model.eval()
+
+            outputs = model(input_ids)
+            start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs
+
+            outputs = model(input_ids, start_positions=sequence_labels,
+                                         end_positions=sequence_labels,
+                                         cls_index=sequence_labels,
+                                         is_impossible=is_impossible_labels,
+                                         p_mask=input_mask)
+
+            outputs = model(input_ids, start_positions=sequence_labels,
+                                         end_positions=sequence_labels,
+                                         cls_index=sequence_labels,
+                                         is_impossible=is_impossible_labels)
+
+            total_loss, start_logits, end_logits, cls_logits = outputs
+
+            outputs = model(input_ids, start_positions=sequence_labels,
+                                         end_positions=sequence_labels)
+
+            total_loss, start_logits, end_logits = outputs
+
+            result = {
+                "loss": total_loss,
+                "start_logits": start_logits,
+                "end_logits": end_logits,
+                "cls_logits": cls_logits,
+            }
+
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["start_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["cls_logits"].size()),
+                [self.batch_size])
 
 
-        # def create_and_check_xlm_for_sequence_classification(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-        #     config.num_labels = self.num_labels
-        #     model = XLMForSequenceClassification(config)
-        #     model.eval()
-        #     loss, logits = model(input_ids, token_type_ids, input_lengths, sequence_labels)
-        #     result = {
-        #         "loss": loss,
-        #         "logits": logits,
-        #     }
-        #     self.parent.assertListEqual(
-        #         list(result["logits"].size()),
-        #         [self.batch_size, self.num_labels])
-        #     self.check_loss_output(result)
+        def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+            model = XLMForSequenceClassification(config)
+            model.eval()
+
+            (logits,) = model(input_ids)
+            loss, logits = model(input_ids, labels=sequence_labels)
+
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.type_sequence_label_size])
 
 
-        # def create_and_check_xlm_for_token_classification(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-        #     config.num_labels = self.num_labels
-        #     model = XLMForTokenClassification(config=config)
-        #     model.eval()
-        #     loss, logits = model(input_ids, token_type_ids, input_lengths, token_labels)
-        #     result = {
-        #         "loss": loss,
-        #         "logits": logits,
-        #     }
-        #     self.parent.assertListEqual(
-        #         list(result["logits"].size()),
-        #         [self.batch_size, self.seq_length, self.num_labels])
-        #     self.check_loss_output(result)
-
-
-        # def create_and_check_xlm_for_multiple_choice(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
-        #     config.num_choices = self.num_choices
-        #     model = XLMForMultipleChoice(config=config)
-        #     model.eval()
-        #     multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        #     multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        #     multiple_choice_input_lengths = input_lengths.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-        #     loss, logits = model(multiple_choice_inputs_ids,
-        #                  multiple_choice_token_type_ids,
-        #                  multiple_choice_input_lengths,
-        #                  choice_labels)
-        #     result = {
-        #         "loss": loss,
-        #         "logits": logits,
-        #     }
-        #     self.parent.assertListEqual(
-        #         list(result["logits"].size()),
-        #         [self.batch_size, self.num_choices])
-        #     self.check_loss_output(result)
-
-
-        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, choice_labels):
+        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
             create_and_check_commons(self, config, inputs_dict)
 
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
index 13030567b9..58617cf7b9 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
@@ -49,6 +49,7 @@ class XLNetModelTest(unittest.TestCase):
                      d_inner=128,
                      num_hidden_layers=5,
                      max_position_embeddings=10,
+                     type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
                      same_length=False,
@@ -80,12 +81,14 @@ class XLNetModelTest(unittest.TestCase):
             self.initializer_range = initializer_range
             self.seed = seed
             self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
             self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
             input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
             segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
 
             input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
             perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
@@ -94,30 +97,13 @@ class XLNetModelTest(unittest.TestCase):
             target_mapping[:, 0, -1] = 1.0  # predict last token
             inp_q = target_mapping[:, 0, :].clone()  # predict last token
 
-            # inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            # token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            # input_mask: float32 Tensor in shape [bsz, len], the input mask.
-            #     0 for real tokens and 1 for padding.
-            # mems: a list of float32 Tensors in shape [bsz, mem_len, hidden_size], memory
-            #     from previous batches. The length of the list equals num_hidden_layers.
-            #     If None, no memory is used.
-            # perm_mask: float32 Tensor in shape [bsz, len, len].
-            #     If perm_mask[k, i, j] = 0, i attend to j in batch k;
-            #     if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-            #     If None, each position attends to all the others.
-            # target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-            #     If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-            #     on the j-th token.
-            #     Only used during pretraining for partial prediction.
-            #     Set to None during finetuning.
-            # inp_q: float32 Tensor in shape [bsz, len].
-            #     1 for tokens with losses and 0 for tokens without losses.
-            #     Only used during pretraining for two-stream attention.
-            #     Set to None during finetuning.
-
+            sequence_labels = None
             lm_labels = None
+            is_impossible_labels = None
             if self.use_labels:
                 lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
 
             config = XLNetConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -132,18 +118,23 @@ class XLNetModelTest(unittest.TestCase):
                 same_length=self.same_length,
                 reuse_len=self.reuse_len,
                 bi_data=self.bi_data,
-                initializer_range=self.initializer_range)
+                initializer_range=self.initializer_range,
+                num_labels=self.type_sequence_label_size)
 
-            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels)
+            return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                    target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
 
         def set_seed(self):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
-        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+        def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetModel(config)
             model.eval()
 
+            _, _ = model(input_ids_1, input_mask=input_mask)
+            _, _ = model(input_ids_1, attention_mask=input_mask)
             _, _ = model(input_ids_1, token_type_ids=segment_ids)
             outputs, mems_1 = model(input_ids_1)
 
@@ -159,7 +150,8 @@ class XLNetModelTest(unittest.TestCase):
                 list(list(mem.size()) for mem in result["mems_1"]),
                 [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
-        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+        def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetLMHeadModel(config)
             model.eval()
 
@@ -198,7 +190,82 @@ class XLNetModelTest(unittest.TestCase):
                 list(list(mem.size()) for mem in result["mems_2"]),
                 [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
-        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, target_mapping, inp_q, segment_ids, lm_labels):
+        def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+            model = XLNetForQuestionAnswering(config)
+            model.eval()
+
+            outputs = model(input_ids_1)
+            start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs
+
+            outputs = model(input_ids_1, start_positions=sequence_labels,
+                                         end_positions=sequence_labels,
+                                         cls_index=sequence_labels,
+                                         is_impossible=is_impossible_labels,
+                                         p_mask=input_mask)
+
+            outputs = model(input_ids_1, start_positions=sequence_labels,
+                                         end_positions=sequence_labels,
+                                         cls_index=sequence_labels,
+                                         is_impossible=is_impossible_labels)
+
+            total_loss, start_logits, end_logits, cls_logits, mems = outputs
+
+            outputs = model(input_ids_1, start_positions=sequence_labels,
+                                         end_positions=sequence_labels)
+
+            total_loss, start_logits, end_logits, mems = outputs
+
+            result = {
+                "loss": total_loss,
+                "start_logits": start_logits,
+                "end_logits": end_logits,
+                "cls_logits": cls_logits,
+                "mems": mems,
+            }
+
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["start_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["cls_logits"].size()),
+                [self.batch_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems"]),
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+
+        def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+            model = XLNetForSequenceClassification(config)
+            model.eval()
+
+            logits, mems_1 = model(input_ids_1)
+            loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels)
+
+            result = {
+                "loss": loss,
+                "mems_1": mems_1,
+                "logits": logits,
+            }
+
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.type_sequence_label_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1"]),
+                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
+
+        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             inputs_dict = {'input_ids': input_ids_1}
             create_and_check_commons(self, config, inputs_dict, test_pruning=False)
 
@@ -224,28 +291,20 @@ class XLNetModelTest(unittest.TestCase):
 
         tester.set_seed()
         config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_lm_head(*config_and_inputs)
+        tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+
+        tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
+
+        tester.set_seed()
+        config_and_inputs = tester.prepare_config_and_inputs()
+        tester.create_and_check_xlnet_qa(*config_and_inputs)
 
         tester.set_seed()
         config_and_inputs = tester.prepare_config_and_inputs()
         tester.create_and_check_xlnet_commons(*config_and_inputs)
 
-    @classmethod
-    def mask_tensor(cls, shape, vocab_size, rng=None, name=None):
-        """Creates a tensor with padding on the right (0.0 for )."""
-        if rng is None:
-            rng = random.Random()
-
-        total_dims = 1
-        for dim in shape:
-            total_dims *= dim
-
-        values = []
-        for _ in range(total_dims):
-            values.append(rng.randint(0, vocab_size - 1))
-
-        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
-
 
 if __name__ == "__main__":
     unittest.main()

From cf86d23effd6adb6cbae613a98a2007eb98251ea Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 4 Jul 2019 17:02:21 +0200
Subject: [PATCH 051/139] parallelism in circlci

---
 .circleci/config.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 89819c1009..858ca001d6 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,6 +12,7 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
             - run: codecov
+        parallelism: 4
     build_py2:
         working_directory: ~/pytorch-pretrained-BERT
         docker:
@@ -24,6 +25,7 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
             - run: codecov
+        parallelism: 4
 workflows:
   version: 2
   build_and_test:

From c0239e09e634aac57a111717c25461f1e950cb3e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 4 Jul 2019 17:06:30 +0200
Subject: [PATCH 052/139] first commit

---
 docs/index.rst | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 docs/index.rst

diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000000..4639f1d218
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,2 @@
+Home
+====

From e75c3f70aa5122dbdd06cc693a58df837e422721 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:20:27 +0200
Subject: [PATCH 053/139] standardizing tokenizers API and adding tests

---
 pytorch_pretrained_bert/model_utils.py        |  6 ++
 pytorch_pretrained_bert/modeling_bert.py      |  1 +
 .../tests/tokenization_bert_test.py           | 17 +---
 .../tests/tokenization_gpt2_test.py           | 15 +---
 .../tests/tokenization_openai_test.py         | 16 +---
 .../tests/tokenization_tests_commons.py       | 81 +++++++++++++++++++
 .../tests/tokenization_transfo_xl_test.py     | 15 +---
 .../tests/tokenization_xlm_test.py            | 15 +---
 .../tests/tokenization_xlnet_test.py          | 34 ++------
 pytorch_pretrained_bert/tokenization_bert.py  | 16 +++-
 pytorch_pretrained_bert/tokenization_gpt2.py  |  6 +-
 .../tokenization_openai.py                    |  5 +-
 .../tokenization_transfo_xl.py                | 20 ++++-
 pytorch_pretrained_bert/tokenization_xlm.py   |  5 +-
 pytorch_pretrained_bert/tokenization_xlnet.py |  5 +-
 15 files changed, 150 insertions(+), 107 deletions(-)
 create mode 100644 pytorch_pretrained_bert/tests/tokenization_tests_commons.py

diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_pretrained_bert/model_utils.py
index b72707ce08..051fbdefbc 100644
--- a/pytorch_pretrained_bert/model_utils.py
+++ b/pytorch_pretrained_bert/model_utils.py
@@ -598,3 +598,9 @@ def prune_layer(layer, index, dim=None):
         return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
     else:
         raise ValueError("Can't prune layer of class {}".format(layer.__class__))
+
+def clean_up_tokenization(out_string):
+    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+    return out_string
diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_pretrained_bert/modeling_bert.py
index 7b18cb8452..d4967b3718 100644
--- a/pytorch_pretrained_bert/modeling_bert.py
+++ b/pytorch_pretrained_bert/modeling_bert.py
@@ -48,6 +48,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
+
 PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
diff --git a/pytorch_pretrained_bert/tests/tokenization_bert_test.py b/pytorch_pretrained_bert/tests/tokenization_bert_test.py
index e00771c1b1..3d0b4323b2 100644
--- a/pytorch_pretrained_bert/tests/tokenization_bert_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_bert_test.py
@@ -26,6 +26,7 @@ from pytorch_pretrained_bert.tokenization_bert import (BasicTokenizer,
                                                   _is_control, _is_punctuation,
                                                   _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
 
+from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class TokenizationTest(unittest.TestCase):
 
@@ -36,28 +37,18 @@ class TokenizationTest(unittest.TestCase):
         ]
         with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
             vocab_file = vocab_writer.name
 
+        create_and_check_tokenizer_commons(self, BertTokenizer, vocab_file)
+
         tokenizer = BertTokenizer(vocab_file)
-        os.remove(vocab_file)
 
         tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-        vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer = tokenizer.from_pretrained(vocab_file)
         os.remove(vocab_file)
 
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"
diff --git a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py b/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
index 4ae804a060..70f69a1f23 100644
--- a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
@@ -22,6 +22,7 @@ import pytest
 
 from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
+from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class GPT2TokenizationTest(unittest.TestCase):
 
@@ -39,10 +40,9 @@ class GPT2TokenizationTest(unittest.TestCase):
             fp.write("\n".join(merges))
             merges_file = fp.name
 
-        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+        create_and_check_tokenizer_commons(self, GPT2Tokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
 
+        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
         text = "lower"
         bpe_tokens = ["low", "er"]
         tokens = tokenizer.tokenize(text)
@@ -53,17 +53,8 @@ class GPT2TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/")
         os.remove(vocab_file)
         os.remove(merges_file)
-        os.remove(special_tokens_file)
-
-        self.assertListEqual(
-            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
-             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
-            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
-             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
 
     # @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
diff --git a/pytorch_pretrained_bert/tests/tokenization_openai_test.py b/pytorch_pretrained_bert/tests/tokenization_openai_test.py
index a57f86be57..6ae72858a7 100644
--- a/pytorch_pretrained_bert/tests/tokenization_openai_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_openai_test.py
@@ -22,6 +22,8 @@ import pytest
 
 from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
+from.tokenization_tests_commons import create_and_check_tokenizer_commons
+
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
 
@@ -40,6 +42,8 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             fp.write("\n".join(merges))
             merges_file = fp.name
 
+        create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+
         tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
         os.remove(vocab_file)
         os.remove(merges_file)
@@ -54,18 +58,6 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer_2 = OpenAIGPTTokenizer.from_pretrained("/tmp/")
-        os.remove(vocab_file)
-        os.remove(merges_file)
-        os.remove(special_tokens_file)
-
-        self.assertListEqual(
-            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
-             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
-            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
-             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
-
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"
diff --git a/pytorch_pretrained_bert/tests/tokenization_tests_commons.py b/pytorch_pretrained_bert/tests/tokenization_tests_commons.py
new file mode 100644
index 0000000000..e8f7ee7a25
--- /dev/null
+++ b/pytorch_pretrained_bert/tests/tokenization_tests_commons.py
@@ -0,0 +1,81 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+from io import open
+
+if sys.version_info[0] == 3:
+    unicode = str
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+
+def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class(*inputs, **kwargs)
+
+    before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+
+    vocab_path="/tmp/"
+    output_files = tokenizer.save_vocabulary(vocab_path=vocab_path)
+    tokenizer = tokenizer.from_pretrained(vocab_path)
+
+    for f in output_files:
+        os.remove(f)
+
+    after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+    tester.assertListEqual(before_tokens, after_tokens)
+
+def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class(*inputs, **kwargs)
+
+    text = "Munich and Berlin are nice cities"
+    filename = u"/tmp/tokenizer.bin"
+
+    subwords = tokenizer.tokenize(text)
+
+    pickle.dump(tokenizer, open(filename, "wb"))
+
+    tokenizer_new = pickle.load(open(filename, "rb"))
+    subwords_loaded = tokenizer_new.tokenize(text)
+
+    tester.assertListEqual(subwords, subwords_loaded)
+
+
+def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class(*inputs, **kwargs)
+
+    text = u"He is very happy, UNwant\u00E9d,running"
+    tokens = tokenizer.tokenize(text)
+    ids = tokenizer.convert_tokens_to_ids(tokens)
+    ids_2 = tokenizer.encode(text)
+    tester.assertListEqual(ids, ids_2)
+
+    tokens_2 = tokenizer.convert_ids_to_tokens(ids)
+    text_2 = tokenizer.decode(ids)
+
+    tester.assertNotEqual(len(tokens_2), 0)
+    tester.assertIsInstance(text_2, (str, unicode))
+
+def create_and_check_tokenizer_commons(tester, tokenizer_class, *inputs, **kwargs):
+    create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+    create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+    create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
diff --git a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py b/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
index 226db4598e..a5ff30ab6e 100644
--- a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
@@ -22,6 +22,7 @@ import pytest
 
 from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
+from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class TransfoXLTokenizationTest(unittest.TestCase):
 
@@ -33,8 +34,9 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
 
+        create_and_check_tokenizer_commons(self, TransfoXLTokenizer, vocab_file=vocab_file, lower_case=True)
+
         tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
-        tokenizer.build_vocab()
         os.remove(vocab_file)
 
         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
@@ -43,17 +45,6 @@ class TransfoXLTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
-        vocab_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer = tokenizer.from_pretrained(vocab_file)
-        os.remove(vocab_file)
-
-        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
-
-
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
 
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py b/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
index d288f2fe60..3b2db8ea1f 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
@@ -22,6 +22,7 @@ import pytest
 
 from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
+from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class XLMTokenizationTest(unittest.TestCase):
 
@@ -40,6 +41,8 @@ class XLMTokenizationTest(unittest.TestCase):
             fp.write("\n".join(merges))
             merges_file = fp.name
 
+        create_and_check_tokenizer_commons(self, XLMTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+
         tokenizer = XLMTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
         os.remove(vocab_file)
         os.remove(merges_file)
@@ -54,18 +57,6 @@ class XLMTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
-        tokenizer_2 = XLMTokenizer.from_pretrained("/tmp/")
-        os.remove(vocab_file)
-        os.remove(merges_file)
-        os.remove(special_tokens_file)
-
-        self.assertListEqual(
-            [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
-             tokenizer.special_tokens, tokenizer.special_tokens_decoder],
-            [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
-             tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
-
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py b/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
index 707a516b96..9b6dd5a6c4 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
+++ b/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
@@ -15,28 +15,25 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
-import sys
 import unittest
-from io import open
 import shutil
 import pytest
 
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
 from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
 
+from.tokenization_tests_commons import create_and_check_tokenizer_commons
+
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
 
 class XLNetTokenizationTest(unittest.TestCase):
 
     def test_full_tokenizer(self):
-        tokenizer = XLNetTokenizer(SAMPLE_VOCAB)
+        create_and_check_tokenizer_commons(self, XLNetTokenizer, SAMPLE_VOCAB)
+
+        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
         tokens = tokenizer.tokenize(u'This is a test')
         self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
@@ -44,11 +41,6 @@ class XLNetTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
 
-        vocab_path = u"/tmp/"
-        vocab_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path)
-        tokenizer = tokenizer.from_pretrained(vocab_path,
-                                              keep_accents=True)
-
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
         self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                       u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
@@ -68,22 +60,6 @@ class XLNetTokenizationTest(unittest.TestCase):
                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                            u'<unk>', u'.'])
 
-        text = "Munich and Berlin are nice cities"
-        filename = u"/tmp/tokenizer.bin"
-
-        subwords = tokenizer.tokenize(text)
-
-        pickle.dump(tokenizer, open(filename, "wb"))
-
-        tokenizer_new = pickle.load(open(filename, "rb"))
-        subwords_loaded = tokenizer_new.tokenize(text)
-
-        self.assertListEqual(subwords, subwords_loaded)
-
-        os.remove(filename)
-        os.remove(vocab_file)
-        os.remove(special_tokens_file)
-
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
         cache_dir = "/tmp/pytorch_pretrained_bert_test/"
diff --git a/pytorch_pretrained_bert/tokenization_bert.py b/pytorch_pretrained_bert/tokenization_bert.py
index 328964c535..c8db62b9c0 100644
--- a/pytorch_pretrained_bert/tokenization_bert.py
+++ b/pytorch_pretrained_bert/tokenization_bert.py
@@ -23,6 +23,7 @@ import unicodedata
 from io import open
 
 from .file_utils import cached_path
+from .model_utils import clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
@@ -185,6 +186,19 @@ class BertTokenizer(object):
             tokens.append(self.ids_to_tokens[i])
         return tokens
 
+    def encode(self, text):
+        return self.convert_tokens_to_ids(self.tokenize(text))
+
+    def decode(self, token_ids, clean_up_tokenization_spaces=True):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(token_ids)
+        out_string = ''.join(tokens).replace(' ##', '').strip()
+        if clean_up_tokenization_spaces:
+            for special_tok in (self.UNK_TOKEN, self.SEP_TOKEN, self.PAD_TOKEN, self.CLS_TOKEN, self.MASK_TOKEN):
+                out_string = out_string.replace(special_tok, '')
+            out_string = clean_up_tokenization(out_string)
+        return out_string
+
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
@@ -198,7 +212,7 @@ class BertTokenizer(object):
                     index = token_index
                 writer.write(token + u'\n')
                 index += 1
-        return vocab_file
+        return (vocab_file,)
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py
index 78f7f59d65..2947ce66b8 100644
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -23,6 +23,8 @@ import os
 import regex as re
 from io import open
 
+from .model_utils import clean_up_tokenization
+
 try:
     from functools import lru_cache
 except ImportError:
@@ -275,9 +277,7 @@ class GPT2Tokenizer(object):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         if clean_up_tokenization_spaces:
             text = text.replace('<unk>', '')
-            text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+            text = clean_up_tokenization(text)
         return text
 
     def save_vocabulary(self, vocab_path):
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 5b2bd31cd0..7d005a8260 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -26,6 +26,7 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
+from .model_utils import clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
@@ -277,9 +278,7 @@ class OpenAIGPTTokenizer(object):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         if clean_up_tokenization_spaces:
             out_string = out_string.replace('<unk>', '')
-            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+            out_string = clean_up_tokenization(out_string)
         return out_string
 
     def save_vocabulary(self, vocab_path):
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 6a882e0a7f..7e83680770 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -31,6 +31,7 @@ import torch
 import numpy as np
 
 from .file_utils import cached_path
+from .model_utils import clean_up_tokenization
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -109,6 +110,9 @@ class TransfoXLTokenizer(object):
         self.vocab_file = vocab_file
         self.never_split = never_split
 
+        if vocab_file is not None:
+            self.build_vocab()
+
     def count_file(self, path, verbose=False, add_eos=False):
         if verbose: print('counting file {} ...'.format(path))
         assert os.path.exists(path)
@@ -155,7 +159,7 @@ class TransfoXLTokenizer(object):
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_NAME)
         torch.save(self.__dict__, vocab_file)
-        return vocab_file
+        return (vocab_file,)
 
     def build_vocab(self):
         if self.vocab_file:
@@ -251,12 +255,20 @@ class TransfoXLTokenizer(object):
     def convert_to_tensor(self, symbols):
         return torch.LongTensor(self.convert_tokens_to_ids(symbols))
 
-    def decode(self, indices, exclude=None):
+    def encode(self, text):
+        return self.convert_tokens_to_ids(self.tokenize(text))
+
+    def decode(self, indices, exclude=None, clean_up_tokenization_spaces=True):
         """Converts a sequence of indices in a string."""
         if exclude is None:
-            return ' '.join([self.get_sym(idx) for idx in indices])
+            out_string = ' '.join([self.get_sym(idx) for idx in indices])
         else:
-            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+            out_string = ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+        if clean_up_tokenization_spaces:
+            out_string = clean_up_tokenization(out_string)
+
+        return out_string
 
     def __len__(self):
         return len(self.idx2sym)
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_pretrained_bert/tokenization_xlm.py
index 25a0c1b542..26c73c56b2 100644
--- a/pytorch_pretrained_bert/tokenization_xlm.py
+++ b/pytorch_pretrained_bert/tokenization_xlm.py
@@ -26,6 +26,7 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
+from .model_utils import clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
@@ -285,9 +286,7 @@ class XLMTokenizer(object):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         if clean_up_tokenization_spaces:
             out_string = out_string.replace('<unk>', '')
-            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+            out_string = clean_up_tokenization(out_string)
         return out_string
 
     def save_vocabulary(self, vocab_path):
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_pretrained_bert/tokenization_xlnet.py
index 3e28bbb807..76b9a9f870 100644
--- a/pytorch_pretrained_bert/tokenization_xlnet.py
+++ b/pytorch_pretrained_bert/tokenization_xlnet.py
@@ -27,6 +27,7 @@ import unicodedata
 import six
 
 from .file_utils import cached_path
+from .model_utils import clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
@@ -316,9 +317,7 @@ class XLNetTokenizer(object):
         out_string = ''.join(tokens)
         if clean_up_tokenization_spaces:
             out_string = out_string.strip().replace('<unk>', '')
-            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+            out_string = clean_up_tokenization(out_string)
         return out_string
 
     def save_vocabulary(self, vocab_path):

From 9113b50c9674c1821a337195ae3cd7f0bb9a86c0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:31:51 +0200
Subject: [PATCH 054/139] hubs [WIP]

---
 hubconfs/xlm_hubconf.py                       | 167 ++++++++++++++++++
 .../{xlnet_hubconf.py => xlnet_hubconf.1.py}  |   0
 2 files changed, 167 insertions(+)
 create mode 100644 hubconfs/xlm_hubconf.py
 rename hubconfs/{xlnet_hubconf.py => xlnet_hubconf.1.py} (100%)

diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
new file mode 100644
index 0000000000..154f875bfb
--- /dev/null
+++ b/hubconfs/xlm_hubconf.py
@@ -0,0 +1,167 @@
+from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer
+from pytorch_pretrained_bert.modeling_xlm import (
+    XLMConfig,
+    XLMModel,
+    XLMWithLMHeadModel,
+    XLMForSequenceClassification,
+    XLMForQuestionAnswering
+)
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_start_docstring = """
+    Model class adapted from the XLM Transformer model of
+        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+        Paper: https://arxiv.org/abs/1901.07291
+        Original code: https://github.com/facebookresearch/XLM
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+"""
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_end_docstring = """
+    Params:
+        pretrained_model_name_or_path: either:
+            - a str with the name of a pre-trained model to load selected in the list of:
+                . `xlm-mlm-en-2048`
+            - a path or url to a pretrained model archive containing:
+                . `config.json` a configuration file for the model
+                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
+        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+        *inputs, **kwargs: additional input for the specific XLM class
+"""
+
+
+def _begin_with_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+def _end_with_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+
+def xlmTokenizer(*args, **kwargs):
+    """
+    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
+
+    Args:
+    pretrained_model_name_or_path: Path to pretrained model archive
+                                   or one of pre-trained vocab configs below.
+                                       * xlm-mlm-en-2048
+    Keyword args:
+    special_tokens: Special tokens in vocabulary that are not pretrained
+                    Default: None
+    max_len: An artificial maximum length to truncate tokenized sequences to;
+             Effective maximum length is always the minimum of this
+             value (if specified) and the underlying model's
+             sequence length.
+             Default: None
+
+    Example:
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        >>> text = "Who was Jim Henson ?"
+        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+    """
+    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
+    return tokenizer
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmModel(*args, **kwargs):
+    """
+        # Load xlmModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                hidden_states_1, mems = model(tokens_tensor_1)
+                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
+    """
+    model = XLMModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmLMHeadModel(*args, **kwargs):
+    """
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetLMHeadModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                predictions_1, mems = model(tokens_tensor_1)
+                predictions_2, mems = model(tokens_tensor_2, mems=mems)
+
+        # Get the predicted last token
+        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        >>> predicted_token = tokenizer.decode([predicted_index])
+        >>> assert predicted_token == ' who'
+    """
+    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+# @_end_with_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+
+#         #  Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model.eval()
+
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
diff --git a/hubconfs/xlnet_hubconf.py b/hubconfs/xlnet_hubconf.1.py
similarity index 100%
rename from hubconfs/xlnet_hubconf.py
rename to hubconfs/xlnet_hubconf.1.py

From 0bab55d5d52e4d538888980d05d73acc6da6274a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:55:36 +0200
Subject: [PATCH 055/139] [BIG] name change

---
 .circleci/config.yml                          |   8 +-
 .coveragerc                                   |   2 +-
 README.md                                     | 112 +++++++++---------
 docker/Dockerfile                             |   2 +-
 examples/bertology.py                         |   2 +-
 examples/generation_xlnet.py                  |   2 +-
 .../lm_finetuning/finetune_on_pregenerated.py |   8 +-
 .../pregenerate_training_data.py              |   2 +-
 .../lm_finetuning/simple_lm_finetuning.py     |   8 +-
 examples/run_bert_classifier.py               |   8 +-
 examples/run_bert_extract_features.py         |   4 +-
 examples/run_bert_squad.py                    |   8 +-
 examples/run_bert_swag.py                     |   8 +-
 examples/run_gpt2.py                          |   2 +-
 examples/run_openai_gpt.py                    |   2 +-
 examples/run_transfo_xl.py                    |   2 +-
 examples/run_xlnet_classifier.py              |   8 +-
 examples/run_xlnet_squad.py                   |   8 +-
 examples/tests/examples_tests.py              |  50 ++++++++
 examples/utils_squad.py                       |   2 +-
 hubconfs/bert_hubconf.py                      |  38 +++---
 hubconfs/gpt2_hubconf.py                      |  18 +--
 hubconfs/gpt_hubconf.py                       |  18 +--
 hubconfs/transformer_xl_hubconf.py            |  14 +--
 hubconfs/xlm_hubconf.py                       |  16 +--
 hubconfs/xlnet_hubconf.1.py                   |  18 +--
 .../Comparing-TF-and-PT-models-MLM-NSP.ipynb  |   8 +-
 notebooks/Comparing-TF-and-PT-models.ipynb    |   6 +-
 .../__init__.py                               |   2 +-
 .../__main__.py                               |  28 ++---
 .../convert_gpt2_checkpoint_to_pytorch.py     |   2 +-
 .../convert_openai_checkpoint_to_pytorch.py   |   2 +-
 .../convert_tf_checkpoint_to_pytorch.py       |   2 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |   6 +-
 .../convert_xlm_checkpoint_to_pytorch.py      |   4 +-
 .../convert_xlnet_checkpoint_to_pytorch.py    |   2 +-
 .../file_utils.py                             |   2 +-
 .../model_utils.py                            |   0
 .../modeling_bert.py                          |   0
 .../modeling_gpt2.py                          |   0
 .../modeling_openai.py                        |   0
 .../modeling_transfo_xl.py                    |   0
 .../modeling_transfo_xl_utilities.py          |   0
 .../modeling_xlm.py                           |   2 +-
 .../modeling_xlnet.py                         |   0
 .../optimization.py                           |   0
 .../optimization_openai.py                    |   0
 .../tests/__init__.py                         |   0
 .../tests/conftest.py                         |   0
 .../tests/fixtures/input.txt                  |   0
 .../tests/fixtures/sample_text.txt            |   0
 .../tests/fixtures/test_sentencepiece.model   | Bin
 .../tests/model_tests_commons.py              |   2 +-
 .../tests/model_utils_test.py                 |   4 +-
 .../tests/modeling_bert_test.py               |   6 +-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   6 +-
 .../tests/modeling_xlm_test.py                |   6 +-
 .../tests/modeling_xlnet_test.py              |   6 +-
 .../tests/optimization_test.py                |   6 +-
 .../tests/tokenization_bert_test.py           |   4 +-
 .../tests/tokenization_gpt2_test.py           |   4 +-
 .../tests/tokenization_openai_test.py         |   4 +-
 .../tests/tokenization_tests_commons.py       |   0
 .../tests/tokenization_transfo_xl_test.py     |   4 +-
 .../tests/tokenization_xlm_test.py            |   4 +-
 .../tests/tokenization_xlnet_test.py          |   4 +-
 .../tokenization_bert.py                      |   0
 .../tokenization_gpt2.py                      |   0
 .../tokenization_openai.py                    |   0
 .../tokenization_transfo_xl.py                |   0
 .../tokenization_xlm.py                       |   0
 .../tokenization_xlnet.py                     |   0
 setup.py                                      |  10 +-
 75 files changed, 280 insertions(+), 230 deletions(-)
 create mode 100644 examples/tests/examples_tests.py
 rename {pytorch_pretrained_bert => pytorch_transformers}/__init__.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/__main__.py (72%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_gpt2_checkpoint_to_pytorch.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_openai_checkpoint_to_pytorch.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_tf_checkpoint_to_pytorch.py (95%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_transfo_xl_checkpoint_to_pytorch.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_xlm_checkpoint_to_pytorch.py (93%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_xlnet_checkpoint_to_pytorch.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/file_utils.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/model_utils.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_bert.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_gpt2.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_transfo_xl.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_transfo_xl_utilities.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_xlm.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_xlnet.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/optimization.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/optimization_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/__init__.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/conftest.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/input.txt (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/sample_text.txt (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/test_sentencepiece.model (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/model_tests_commons.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/model_utils_test.py (89%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_bert_test.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_gpt2_test.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_openai_test.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_transfo_xl_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_xlm_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_xlnet_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/optimization_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_bert_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_gpt2_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_openai_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_tests_commons.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_transfo_xl_test.py (93%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_xlm_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_xlnet_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_bert.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_gpt2.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_transfo_xl.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_xlm.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_xlnet.py (100%)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 858ca001d6..ac23723f98 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,7 +1,7 @@
 version: 2
 jobs:
     build_py3:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:3.5
         steps:
@@ -10,11 +10,11 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
     build_py2:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:2.7
         steps:
@@ -23,7 +23,7 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
 workflows:
diff --git a/.coveragerc b/.coveragerc
index fe05dda9a8..9b8c40ecf1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_pretrained_bert
+source=pytorch_transformers
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/README.md b/README.md
index a5234bd9ba..b1e80edc89 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
 
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 
@@ -47,7 +47,7 @@ This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python
 
 PyTorch pretrained bert can be installed by pip as follows:
 ```bash
-pip install pytorch-pretrained-bert
+pip install pytorch-transformers
 ```
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
@@ -73,7 +73,7 @@ python -m spacy download en
 
 Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage).
 
-A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
+A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
 ```bash
@@ -84,51 +84,51 @@ python -m pytest -sv tests/
 
 This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme:
 
-- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
-  - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
-  - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
-  - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
-  - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
-  - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
-  - [`BertForMultipleChoice`](./pytorch_pretrained_bert/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-  - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
-  - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
+- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_transformers/modeling.py) file):
+  - [`BertModel`](./pytorch_transformers/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
+  - [`BertForMaskedLM`](./pytorch_transformers/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
+  - [`BertForNextSentencePrediction`](./pytorch_transformers/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
+  - [`BertForPreTraining`](./pytorch_transformers/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
+  - [`BertForSequenceClassification`](./pytorch_transformers/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
+  - [`BertForMultipleChoice`](./pytorch_transformers/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+  - [`BertForTokenClassification`](./pytorch_transformers/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
+  - [`BertForQuestionAnswering`](./pytorch_transformers/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
 
-- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
-  - [`OpenAIGPTModel`](./pytorch_pretrained_bert/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
-  - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py) file):
+  - [`OpenAIGPTModel`](./pytorch_transformers/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
+  - [`OpenAIGPTLMHeadModel`](./pytorch_transformers/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_transformers/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) file):
-  - [`TransfoXLModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
-  - [`TransfoXLLMHeadModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
+- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) file):
+  - [`TransfoXLModel`](./pytorch_transformers/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
+  - [`TransfoXLLMHeadModel`](./pytorch_transformers/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
 
-- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py) file):
-  - [`GPT2Model`](./pytorch_pretrained_bert/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
-  - [`GPT2LMHeadModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`GPT2DoubleHeadsModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py) file):
+  - [`GPT2Model`](./pytorch_transformers/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
+  - [`GPT2LMHeadModel`](./pytorch_transformers/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`GPT2DoubleHeadsModel`](./pytorch_transformers/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
+- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_transformers/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
+- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) file):
   - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization.
 
-- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) file):
+- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) file):
   - `OpenAIGPTTokenizer` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
-- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) file):
+- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file):
   - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
-- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
+- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
+- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_transformers/optimization_openai.py) file):
   - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
+- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
   - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
   - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
@@ -175,7 +175,7 @@ First let's prepare a tokenized input with `BertTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -252,7 +252,7 @@ First let's prepare a tokenized input with `OpenAIGPTTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -339,7 +339,7 @@ First let's prepare a tokenized input with `TransfoXLTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -414,7 +414,7 @@ First let's prepare a tokenized input with `GPT2Tokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -552,7 +552,7 @@ where
     - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
     - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
 
-  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
+  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
 
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
 - `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
@@ -586,19 +586,19 @@ model = GPT2Model.from_pretrained('gpt2')
 
 #### Cache directory
 
-`pytorch_pretrained_bert` save the pretrained weights in a cache directory which is located at (in this order of priority):
+`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
 
 - `cache_dir` optional arguments to the `from_pretrained()` method (see above),
 - shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
-- PyTorch cache home + `/pytorch_pretrained_bert/`
+- PyTorch cache home + `/pytorch_transformers/`
   where PyTorch cache home is defined by (in this order):
   - shell environment variable `ENV_TORCH_HOME`
   - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
   - default: `~/.cache/torch/`
 
-Usually, if you don't set any specific environment variable, `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`.
+Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
 
-You can alsways safely delete `pytorch_pretrained_bert` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
 
 ### Serialization best-practices
 
@@ -621,7 +621,7 @@ The *default filenames* of these files are as follow:
 Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
 
 ```python
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
 
 output_dir = "./models/"
 
@@ -719,7 +719,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling.py`](./pytorch_pretrained_bert/modeling.py)
+[`modeling.py`](./pytorch_transformers/modeling.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
@@ -852,7 +852,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py)
+[`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -905,7 +905,7 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 - the tokens in the vocabulary have to be sorted to decreasing frequency.
 
 This model takes as *inputs*:
-[`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
+[`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
 - `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
@@ -952,7 +952,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py)
+[`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -1020,7 +1020,7 @@ and three methods:
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
 - `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
+Please refer to the doc strings and code in [`tokenization.py`](./pytorch_transformers/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
 
 #### `OpenAIGPTTokenizer`
 
@@ -1043,7 +1043,7 @@ and five methods:
 - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
+Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
 
 #### `TransfoXLTokenizer`
 
@@ -1051,7 +1051,7 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
 
 The API is similar to the API of `BertTokenizer` (see above).
 
-Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
+Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
 
 #### `GPT2Tokenizer`
 
@@ -1073,7 +1073,7 @@ and two methods:
 - `decode(tokens)`: convert back a list of `int` tokens in a `str`.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
+Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
 
 ### Optimizers
 
@@ -1155,7 +1155,7 @@ Here is how to use these techniques in our scripts:
 - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
 - **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
 
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-pretrained-BERT/pull/116).
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
 ```bash
@@ -1660,7 +1660,7 @@ To help you understand and use these features, we have added a specific example
 
 ## Notebooks
 
-We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-transformers/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 
 - The first NoteBook ([Comparing-TF-and-PT-models.ipynb](./notebooks/Comparing-TF-and-PT-models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
@@ -1676,7 +1676,7 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 
 ### BERT
 
-You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
 
 This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)).
 
@@ -1689,7 +1689,7 @@ Here is an example of the conversion process for a pre-trained `BERT-Base Uncase
 ```shell
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-pytorch_pretrained_bert bert \
+pytorch_transformers bert \
   $BERT_BASE_DIR/bert_model.ckpt \
   $BERT_BASE_DIR/bert_config.json \
   $BERT_BASE_DIR/pytorch_model.bin
@@ -1704,7 +1704,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 ```shell
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-pytorch_pretrained_bert gpt \
+pytorch_transformers gpt \
   $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [OPENAI_GPT_CONFIG]
@@ -1717,7 +1717,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 ```shell
 export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-pytorch_pretrained_bert transfo_xl \
+pytorch_transformers transfo_xl \
   $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [TRANSFO_XL_CONFIG]
@@ -1730,7 +1730,7 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
 ```shell
 export GPT2_DIR=/path/to/gpt2/checkpoint
 
-pytorch_pretrained_bert gpt2 \
+pytorch_transformers gpt2 \
   $GPT2_DIR/model.ckpt \
   $PYTORCH_DUMP_OUTPUT \
   [GPT2_CONFIG]
@@ -1744,7 +1744,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 
-pytorch_pretrained_bert xlnet \
+pytorch_transformers xlnet \
   $TRANSFO_XL_CHECKPOINT_PATH \
   $TRANSFO_XL_CONFIG_PATH \
   $PYTORCH_DUMP_OUTPUT \
diff --git a/docker/Dockerfile b/docker/Dockerfile
index e47eb548f9..1a6c6f06f9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
 
-RUN pip install pytorch-pretrained-bert
+RUN pip install pytorch_transformers
 
 WORKDIR /workspace
\ No newline at end of file
diff --git a/examples/bertology.py b/examples/bertology.py
index 6f7f7c9592..096b1b44fc 100644
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -12,7 +12,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
+from pytorch_transformers import BertForSequenceClassification, BertTokenizer
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
index e54f6a365f..fe3610cfd1 100644
--- a/examples/generation_xlnet.py
+++ b/examples/generation_xlnet.py
@@ -1,6 +1,6 @@
 import torch
 from torch.nn import functional as F
-from pytorch_pretrained_bert import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
 
 import logging
 logging.basicConfig(level=logging.INFO)
diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 8eda2aa5c5..505cd466f6 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -13,10 +13,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
 
diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index c2211c88e6..b79257fd4b 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -5,7 +5,7 @@ from tempfile import TemporaryDirectory
 import shelve
 
 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index bcfd138442..3008787cd1 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,10 +29,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 233a7ee5d1..506aecc5b1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForSequenceClassification
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_bert_extract_features.py b/examples/run_bert_extract_features.py
index 2a550c431a..cc7dedd6af 100644
--- a/examples/run_bert_extract_features.py
+++ b/examples/run_bert_extract_features.py
@@ -28,8 +28,8 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import BertModel
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import BertModel
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index f8eee9c8eb..c3fdb06316 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/run_bert_swag.py b/examples/run_bert_swag.py
index 3e45225891..00cd3a7840 100644
--- a/examples/run_bert_swag.py
+++ b/examples/run_bert_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 8f8208bbcd..a759e449f9 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index ac5c474491..02b86b3a22 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                      OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 0ea7b32053..fda0d8dc28 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index e30cad773b..7cf8a8d877 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index c299358b79..393fa98abd 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import BertForQuestionAnswering
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/tests/examples_tests.py b/examples/tests/examples_tests.py
new file mode 100644
index 0000000000..120df35f82
--- /dev/null
+++ b/examples/tests/examples_tests.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index 0dfecd202c..c858776183 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_pretrained_bert.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 logger = logging.getLogger(__name__)
 
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 94c7a18a30..0ee0df6697 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import (
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import (
         BertModel,
         BertForNextSentencePrediction,
         BertForMaskedLM,
@@ -86,7 +86,7 @@ def bertTokenizer(*args, **kwargs):
     Example:
         >>> import torch
         >>> sentence = 'Hello, World!'
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         >>> toks = tokenizer.tokenize(sentence)
         ['Hello', '##,', 'World', '##!']
         >>> ids = tokenizer.convert_tokens_to_ids(toks)
@@ -106,7 +106,7 @@ def bertModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -115,7 +115,7 @@ def bertModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
         >>> model.eval()
         # Predict hidden states features for each layer
         >>> with torch.no_grad():
@@ -135,7 +135,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -144,7 +144,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForNextSentencePrediction
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
         >>> model.eval()
         # Predict the next sentence classification logits
         >>> with torch.no_grad():
@@ -165,7 +165,7 @@ def bertForPreTraining(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -173,7 +173,7 @@ def bertForPreTraining(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForPreTraining
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
         >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
     """
     model = BertForPreTraining.from_pretrained(*args, **kwargs)
@@ -189,7 +189,7 @@ def bertForMaskedLM(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -200,7 +200,7 @@ def bertForMaskedLM(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForMaskedLM
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
         >>> model.eval()
         # Predict all tokens
         >>> with torch.no_grad():
@@ -231,7 +231,7 @@ def bertForSequenceClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -240,7 +240,7 @@ def bertForSequenceClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the sequence classification logits
         >>> with torch.no_grad():
@@ -266,7 +266,7 @@ def bertForMultipleChoice(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -275,7 +275,7 @@ def bertForMultipleChoice(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
         >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
         # Load bertForMultipleChoice
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
         >>> model.eval()
         # Predict the multiple choice logits
         >>> with torch.no_grad():
@@ -299,7 +299,7 @@ def bertForQuestionAnswering(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -308,7 +308,7 @@ def bertForQuestionAnswering(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForQuestionAnswering
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
         >>> model.eval()
         # Predict the start and end positions logits
         >>> with torch.no_grad():
@@ -338,7 +338,7 @@ def bertForTokenClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -347,7 +347,7 @@ def bertForTokenClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForTokenClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the token classification logits
         >>> with torch.no_grad():
diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py
index 3ac8bc72ab..dbaa2cd612 100644
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_pretrained_bert.modeling_gpt2 import (
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.modeling_gpt2 import (
     GPT2Model,
     GPT2LMHeadModel,
     GPT2DoubleHeadsModel
@@ -53,7 +53,7 @@ def gpt2Tokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -72,7 +72,7 @@ def gpt2Model(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -83,7 +83,7 @@ def gpt2Model(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2Model
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -105,7 +105,7 @@ def gpt2LMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -116,7 +116,7 @@ def gpt2LMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2LMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -144,7 +144,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -157,7 +157,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load gpt2DoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index f3d03888ae..1683c881fa 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_pretrained_bert.modeling_openai import (
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.modeling_openai import (
 	OpenAIGPTModel,
 	OpenAIGPTLMHeadModel,
 	OpenAIGPTDoubleHeadsModel
@@ -77,7 +77,7 @@ def openAIGPTTokenizer(*args, **kwargs):
 
     Example:
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		
 		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -98,7 +98,7 @@ def openAIGPTModel(*args, **kwargs):
     Example:
         # Load the tokenizer
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -107,7 +107,7 @@ def openAIGPTModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -127,7 +127,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -136,7 +136,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -162,7 +162,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -175,7 +175,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load openAIGPTDoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index d5c697547e..d89db894ad 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_pretrained_bert.modeling_transfo_xl import (
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.modeling_transfo_xl import (
     TransfoXLModel,
     TransfoXLLMHeadModel
 )
@@ -46,7 +46,7 @@ def transformerXLTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
         
         >>> text = "Who was Jim Henson ?"
         >>> tokenized_text = tokenizer.tokenize(tokenized_text)
@@ -64,7 +64,7 @@ def transformerXLModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def transformerXLModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -99,7 +99,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -112,7 +112,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
index 154f875bfb..4f6fd93c24 100644
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer
-from pytorch_pretrained_bert.modeling_xlm import (
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.modeling_xlm import (
     XLMConfig,
     XLMModel,
     XLMWithLMHeadModel,
@@ -18,7 +18,7 @@ xlm_start_docstring = """
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def xlmTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -91,7 +91,7 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
     """
         # Load xlmModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -116,7 +116,7 @@ def xlmLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -143,7 +143,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -156,7 +156,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py
index d3766d04e0..4c5105a241 100644
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.modeling_xlnet import (
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.modeling_xlnet import (
     XLNetConfig,
     XLNetModel,
     XLNetLMHeadModel,
@@ -54,7 +54,7 @@ def xlnetTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -73,7 +73,7 @@ def xlnetModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -84,7 +84,7 @@ def xlnetModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -107,7 +107,7 @@ def xlnetLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -118,7 +118,7 @@ def xlnetLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -145,7 +145,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -158,7 +158,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
index ea7271df96..809f6ea6e0 100644
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -78,7 +78,7 @@
     "import importlib.util\n",
     "import sys\n",
     "import tensorflow as tf\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "\n",
     "def del_all_flags(FLAGS):\n",
     "    flags_dict = FLAGS._flags()    \n",
@@ -3997,9 +3997,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/notebooks/Comparing-TF-and-PT-models.ipynb b/notebooks/Comparing-TF-and-PT-models.ipynb
index 3e438e2f55..b7382e4652 100644
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models.ipynb
@@ -342,7 +342,7 @@
    "outputs": [],
    "source": [
     "import extract_features\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "from extract_features import *"
    ]
   },
@@ -375,8 +375,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_transformers/__init__.py
similarity index 98%
rename from pytorch_pretrained_bert/__init__.py
rename to pytorch_transformers/__init__.py
index 23346967ba..cbd007f872 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.2"
+__version__ = "0.7.0"
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_transformers/__main__.py
similarity index 72%
rename from pytorch_pretrained_bert/__main__.py
rename to pytorch_transformers/__main__.py
index bb9534a830..95504c1493 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_transformers/__main__.py
@@ -4,24 +4,24 @@ def main():
     if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet"]:
         print(
         "Should be used as one of: \n"
-        ">> `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+        ">> `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        ">> `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
+        ">> `pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
+        ">> `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
+        ">> `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
     else:
         if sys.argv[1] == "bert":
             try:
                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
             else:
                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                 TF_CONFIG = sys.argv.pop()
@@ -31,7 +31,7 @@ def main():
             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
             else:
                 OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -46,13 +46,13 @@ def main():
             try:
                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 if 'ckpt' in sys.argv[2].lower():
                     TF_CHECKPOINT = sys.argv[2]
@@ -70,14 +70,14 @@ def main():
             try:
                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -90,14 +90,14 @@ def main():
             try:
                 from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 5 or len(sys.argv) > 6:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 TF_CONFIG = sys.argv[3]
diff --git a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index 51d52a6694..86c8264cb5 100755
--- a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
                                                      GPT2Config,
                                                      GPT2Model,
                                                      load_tf_weights_in_gpt2)
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 566008aaa0..68e9dea624 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                      OpenAIGPTConfig,
                                                      OpenAIGPTModel,
                                                      load_tf_weights_in_openai_gpt)
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
similarity index 95%
rename from pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
index 42f7380969..7530d7e12d 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from pytorch_pretrained_bert.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
similarity index 96%
rename from pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 8d6b9651c7..2d666a1f03 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -23,13 +23,13 @@ from io import open
 
 import torch
 
-import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
-from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
+import pytorch_transformers.tokenization_transfo_xl as data_utils
+from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                          WEIGHTS_NAME,
                                                          TransfoXLConfig,
                                                          TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
-from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME,
                                                              VOCAB_NAME)
 
 if sys.version_info[0] == 2:
diff --git a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
similarity index 93%
rename from pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 44a40174b4..0cbe962cea 100755
--- a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
 
-from pytorch_pretrained_bert.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
-from pytorch_pretrained_bert.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
+from pytorch_transformers.tokenization_xlm import MERGES_NAME, VOCAB_NAME
 
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
index ce4fcc7810..f41db87124 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
 
-from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetConfig,
                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                     XLNetForSequenceClassification,
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_transformers/file_utils.py
similarity index 99%
rename from pytorch_pretrained_bert/file_utils.py
rename to pytorch_transformers/file_utils.py
index 994f47d57c..1397bd416b 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -29,7 +29,7 @@ except ImportError:
     torch_cache_home = os.path.expanduser(
         os.getenv('TORCH_HOME', os.path.join(
             os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
 
 try:
     from urllib.parse import urlparse
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_transformers/model_utils.py
similarity index 100%
rename from pytorch_pretrained_bert/model_utils.py
rename to pytorch_transformers/model_utils.py
diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_transformers/modeling_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_bert.py
rename to pytorch_transformers/modeling_bert.py
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_gpt2.py
rename to pytorch_transformers/modeling_gpt2.py
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_transformers/modeling_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_openai.py
rename to pytorch_transformers/modeling_openai.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl.py
rename to pytorch_transformers/modeling_transfo_xl.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_transformers/modeling_transfo_xl_utilities.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
rename to pytorch_transformers/modeling_transfo_xl_utilities.py
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
similarity index 99%
rename from pytorch_pretrained_bert/modeling_xlm.py
rename to pytorch_transformers/modeling_xlm.py
index 9d1775161d..6decba3cce 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -204,7 +204,7 @@ def gelu(x):
     GELU activation
     https://arxiv.org/abs/1606.08415
     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
+    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
     """
     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_xlnet.py
rename to pytorch_transformers/modeling_xlnet.py
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_transformers/optimization.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization.py
rename to pytorch_transformers/optimization.py
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_transformers/optimization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization_openai.py
rename to pytorch_transformers/optimization_openai.py
diff --git a/pytorch_pretrained_bert/tests/__init__.py b/pytorch_transformers/tests/__init__.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/__init__.py
rename to pytorch_transformers/tests/__init__.py
diff --git a/pytorch_pretrained_bert/tests/conftest.py b/pytorch_transformers/tests/conftest.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/conftest.py
rename to pytorch_transformers/tests/conftest.py
diff --git a/pytorch_pretrained_bert/tests/fixtures/input.txt b/pytorch_transformers/tests/fixtures/input.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/input.txt
rename to pytorch_transformers/tests/fixtures/input.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/sample_text.txt b/pytorch_transformers/tests/fixtures/sample_text.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/sample_text.txt
rename to pytorch_transformers/tests/fixtures/sample_text.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model b/pytorch_transformers/tests/fixtures/test_sentencepiece.model
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model
rename to pytorch_transformers/tests/fixtures/test_sentencepiece.model
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_transformers/tests/model_tests_commons.py
similarity index 99%
rename from pytorch_pretrained_bert/tests/model_tests_commons.py
rename to pytorch_transformers/tests/model_tests_commons.py
index e7c97a0787..b831f85552 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_transformers/tests/model_tests_commons.py
@@ -412,7 +412,7 @@ class GPTModelTester(object):
             [[], []])
 
     def create_and_check_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/model_utils_test.py b/pytorch_transformers/tests/model_utils_test.py
similarity index 89%
rename from pytorch_pretrained_bert/tests/model_utils_test.py
rename to pytorch_transformers/tests/model_utils_test.py
index 59f076fa00..120df35f82 100644
--- a/pytorch_pretrained_bert/tests/model_utils_test.py
+++ b/pytorch_transformers/tests/model_utils_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import PretrainedConfig, PreTrainedModel
-from pytorch_pretrained_bert.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_pretrained_bert/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
similarity index 98%
rename from pytorch_pretrained_bert/tests/modeling_bert_test.py
rename to pytorch_transformers/tests/modeling_bert_test.py
index 7a9d49fde7..b140f5e647 100644
--- a/pytorch_pretrained_bert/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -20,11 +20,11 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
+from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_pretrained_bert.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -266,7 +266,7 @@ class BertModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_gpt2_test.py
rename to pytorch_transformers/tests/modeling_gpt2_test.py
index 122cdf3c7b..4ace52571a 100644
--- a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -25,7 +25,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
+from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_openai_test.py
rename to pytorch_transformers/tests/modeling_openai_test.py
index 627bc564de..fe81157023 100644
--- a/pytorch_pretrained_bert/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -21,7 +21,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
+from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
rename to pytorch_transformers/tests/modeling_transfo_xl_test.py
index caeb25b412..d15a19eb64 100644
--- a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -184,7 +184,7 @@ class TransfoXLModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlm_test.py
rename to pytorch_transformers/tests/modeling_xlm_test.py
index 3e442a09fb..8a8905cc31 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -20,8 +20,8 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
-from pytorch_pretrained_bert.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
+from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -250,7 +250,7 @@ class XLMModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlnet_test.py
rename to pytorch_transformers/tests/modeling_xlnet_test.py
index 58617cf7b9..b9d55a26c7 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
+from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -278,7 +278,7 @@ class XLNetModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/optimization_test.py
rename to pytorch_transformers/tests/optimization_test.py
index c6924bd4bc..dfbbd44b6e 100644
--- a/pytorch_pretrained_bert/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -20,9 +20,9 @@ import unittest
 
 import torch
 
-from pytorch_pretrained_bert import BertAdam
-from pytorch_pretrained_bert import OpenAIAdam
-from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
+from pytorch_transformers import BertAdam
+from pytorch_transformers import OpenAIAdam
+from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
     WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
 import numpy as np
 
diff --git a/pytorch_pretrained_bert/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_bert_test.py
rename to pytorch_transformers/tests/tokenization_bert_test.py
index 3d0b4323b2..59a87a4cb9 100644
--- a/pytorch_pretrained_bert/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_bert import (BasicTokenizer,
+from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
@@ -51,7 +51,7 @@ class TokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
rename to pytorch_transformers/tests/tokenization_gpt2_test.py
index 70f69a1f23..c6d926bdd4 100644
--- a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -58,7 +58,7 @@ class GPT2TokenizationTest(unittest.TestCase):
 
     # @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_openai_test.py
rename to pytorch_transformers/tests/tokenization_openai_test.py
index 6ae72858a7..38315f927b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -60,7 +60,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/tokenization_tests_commons.py
rename to pytorch_transformers/tests/tokenization_tests_commons.py
diff --git a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
similarity index 93%
rename from pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
rename to pytorch_transformers/tests/tokenization_transfo_xl_test.py
index a5ff30ab6e..f744e319c8 100644
--- a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -61,7 +61,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_xlm_test.py
rename to pytorch_transformers/tests/tokenization_xlm_test.py
index 3b2db8ea1f..9cc18f3d60 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -59,7 +59,7 @@ class XLMTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
rename to pytorch_transformers/tests/tokenization_xlnet_test.py
index 9b6dd5a6c4..4dd76e114b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,7 +19,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
 
@@ -62,7 +62,7 @@ class XLNetTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_bert.py
rename to pytorch_transformers/tokenization_bert.py
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_gpt2.py
rename to pytorch_transformers/tokenization_gpt2.py
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_openai.py
rename to pytorch_transformers/tokenization_openai.py
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_transfo_xl.py
rename to pytorch_transformers/tokenization_transfo_xl.py
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlm.py
rename to pytorch_transformers/tokenization_xlm.py
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlnet.py
rename to pytorch_transformers/tokenization_xlnet.py
diff --git a/setup.py b/setup.py
index 28e85a0068..09b8c01ad5 100644
--- a/setup.py
+++ b/setup.py
@@ -37,16 +37,16 @@ from io import open
 from setuptools import find_packages, setup
 
 setup(
-    name="pytorch_pretrained_bert",
-    version="0.6.2",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    name="pytorch_transformers",
+    version="0.7.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='BERT NLP deep learning google',
     license='Apache',
-    url="https://github.com/huggingface/pytorch-pretrained-BERT",
+    url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
     install_requires=['torch>=0.4.1',
@@ -58,7 +58,7 @@ setup(
                       'sentencepiece'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
+        "pytorch_transformers=pytorch_transformers.__main__:main",
       ]
     },
     # python_requires='>=3.5.0',

From 0231ba291e585cca2d01b723202d40ebdd5541b7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:59:04 +0200
Subject: [PATCH 056/139] circle-ci

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b1e80edc89..77a7a9a88a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
 
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 

From eb91f6437e539f2434b43d6107b8a021453c3f0d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 12:30:15 +0200
Subject: [PATCH 057/139] update readme and setup

---
 README.md | 130 ++++++++++++++++++++++++++++++++++++------------------
 setup.py  |   4 +-
 2 files changed, 89 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 77a7a9a88a..ef0d3ba3bc 100644
--- a/README.md
+++ b/README.md
@@ -2,34 +2,38 @@
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
 
-This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
+This repository contains op-for-op PyTorch implementations, pre-trained models and fine-tuning examples for:
 
 - [Google's BERT model](https://github.com/google-research/bert),
 - [OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm),
-- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl), and
 - [OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/).
+- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl), and
+- [Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/).
+- [Facebook's XLM model](https://github.com/facebookresearch/XLM/).
 
 These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the [Examples](#examples) section below.
 
 Here are some information on these models:
 
-**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
-**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
-**Google/CMU's Transformer-XL** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-This PyTorch implementation of Transformer-XL is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+**OpenAI GPT-2** was released together with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. This PyTorch implementation of OpenAI GPT-2 is an adaptation of the [OpenAI's implementation](https://github.com/openai/gpt-2) and is provided with [OpenAI's pre-trained model](https://github.com/openai/gpt-2) and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
-**OpenAI GPT-2** was released together with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-This PyTorch implementation of OpenAI GPT-2 is an adaptation of the [OpenAI's implementation](https://github.com/openai/gpt-2) and is provided with [OpenAI's pre-trained model](https://github.com/openai/gpt-2) and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+**Google/CMU's Transformer-XL** was released together with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+This PyTorch implementation of XLNet is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
+**Google/CMU's XLNet** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of XLNet is provided with [Google/CMU's pre-trained models](https://github.com/zihangdai/xlnet) and examples. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+
+**Facebook's XLM** was released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+This PyTorch implementation of XLM is an adaptation of the original [PyTorch implementation](https://github.com/facebookresearch/XLM). A command-line interface is provided to convert original PyTorch checkpoints in PyTorch models according to the present repository.
 
 ## Content
 
 | Section | Description |
-|-|-|
+| - | - |
 | [Installation](#installation) | How to install the package |
 | [Overview](#overview) | Overview of the package |
 | [Usage](#usage) | Quickstart examples |
@@ -46,11 +50,13 @@ This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python
 ### With pip
 
 PyTorch pretrained bert can be installed by pip as follows:
+
 ```bash
 pip install pytorch-transformers
 ```
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+
 ```bash
 pip install spacy ftfy==4.4.3
 python -m spacy download en
@@ -61,11 +67,13 @@ If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default
 ### From source
 
 Clone the repository and run:
+
 ```bash
 pip install [--editable] .
 ```
 
 Here also, if you want to reproduce the original tokenization process of the `OpenAI GPT` model, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+
 ```bash
 pip install spacy ftfy==4.4.3
 python -m spacy download en
@@ -76,6 +84,7 @@ Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will
 A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
+
 ```bash
 python -m pytest -sv tests/
 ```
@@ -500,7 +509,6 @@ with torch.no_grad():
     lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
 ```
 
-
 ## Doc
 
 Here is a detailed documentation of the classes in the package and how to use them:
@@ -559,12 +567,12 @@ where
 - `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
 - `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
-
 `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
 
 **When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
 
 Examples:
+
 ```python
 # BERT
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
@@ -720,6 +728,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling.py`](./pytorch_transformers/modeling.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
@@ -759,7 +768,6 @@ An example on how to use this class is given in the [`run_bert_extract_features.
 
 An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script which can be used to fine-tune the BERT language model on your specific different text corpus. This should improve model performance, if the language style is different from the original BERT training corpus (Wiki + BookCorpus).
 
-
 #### 3. `BertForMaskedLM`
 
 `BertForMaskedLM` includes the `BertModel` Transformer followed by the (possibly) pre-trained  masked language modeling head.
@@ -853,6 +861,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -862,6 +871,7 @@ We detail them here. This model takes as *inputs*:
 - `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
 
 This model *outputs*:
+
 - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
 
 #### 10. `OpenAIGPTLMHeadModel`
@@ -869,9 +879,11 @@ This model *outputs*:
 `OpenAIGPTLMHeadModel` includes the `OpenAIGPTModel` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
 
 *Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus optional labels:
+
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 
 *Outputs*:
+
 - if `lm_labels` is not `None`:
   Outputs the language modeling loss.
 - else:
@@ -880,15 +892,18 @@ This model *outputs*:
 #### 11. `OpenAIGPTDoubleHeadsModel`
 
 `OpenAIGPTDoubleHeadsModel` includes the `OpenAIGPTModel` Transformer followed by two heads:
+
 - a language modeling head with weights tied to the input embeddings (no additional parameters) and:
 - a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
 
 *Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus a classification mask and two optional labels:
+
 - `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 - `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
 
 *Outputs*:
+
 - if `lm_labels` and `multiple_choice_labels` are not `None`:
   Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
 - else Outputs a tuple with:
@@ -906,14 +921,17 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 
 This model takes as *inputs*:
 [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
 - `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 This model *outputs* a tuple of (last_hidden_state, new_mems)
+
 - `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
 - `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
-##### Extracting a list of the hidden states at each layer of the Transformer-XL from `last_hidden_state` and `new_mems`:
+##### Extracting a list of the hidden states at each layer of the Transformer-XL from `last_hidden_state` and `new_mems`
+
 The `new_mems` contain all the hidden states PLUS the output of the embeddings (`new_mems[0]`). `new_mems[-1]` is the output of the hidden state of the layer below the last layer and `last_hidden_state` is the output of the last layer (i.E. the input of the softmax when we have a language modeling head on top).
 
 There are two differences between the shapes of `new_mems` and `last_hidden_state`: `new_mems` have transposed first dimensions and are longer (of size `self.config.mem_len`). Here is how to extract the full list of hidden states from the model output:
@@ -930,11 +948,13 @@ all_hidden_states = lower_hidden_states + [hidden_states]
 `TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
 
 *Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
+
 - `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the labels token indices selected in the range [0, self.config.n_token[
 
 *Outputs* a tuple of (last_hidden_state, new_mems)
+
 - `softmax_output`: output of the (adaptive) softmax:
-  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] 
+  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
   - else: Negative log likelihood of labels tokens with shape [batch_size, sequence_length]
 - `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
@@ -953,6 +973,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -963,6 +984,7 @@ We detail them here. This model takes as *inputs*:
 - `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
 
 This model *outputs*:
+
 - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
 - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
 
@@ -971,9 +993,11 @@ This model *outputs*:
 `GPT2LMHeadModel` includes the `GPT2Model` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
 
 *Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus optional labels:
+
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 
 *Outputs*:
+
 - if `lm_labels` is not `None`:
   Outputs the language modeling loss.
 - else: a tuple of
@@ -983,15 +1007,18 @@ This model *outputs*:
 #### 16. `GPT2DoubleHeadsModel`
 
 `GPT2DoubleHeadsModel` includes the `GPT2Model` Transformer followed by two heads:
+
 - a language modeling head with weights tied to the input embeddings (no additional parameters) and:
 - a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
 
 *Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus a classification mask and two optional labels:
+
 - `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 - `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
 
 *Outputs*:
+
 - if `lm_labels` and `multiple_choice_labels` are not `None`:
   Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
 - else Outputs a tuple with:
@@ -1108,30 +1135,32 @@ The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in t
 `OpenAIAdam` accepts the same arguments as `BertAdam`.
 
 #### Learning Rate Schedules
+
 The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
 All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction.
-When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, 
-the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. 
+When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`,
+the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used.
 An overview of the implemented schedules:
+
 - `ConstantLR`: always returns learning rate 1.
 - `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
-    ![](docs/imgs/warmup_constant_schedule.png)
+    ![warmup constant schedule](docs/imgs/warmup_constant_schedule.png)
 - `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    ![](docs/imgs/warmup_linear_schedule.png)
--  `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    ![warmup linear schedule](docs/imgs/warmup_linear_schedule.png)
+- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    ![](docs/imgs/warmup_cosine_schedule.png)
+    ![warmup cosine schedule](docs/imgs/warmup_cosine_schedule.png)
 - `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts).
-    ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
+    ![warmup cosine hard restarts schedule](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
 - `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length.
     Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
     followed by a learning rate decreasing from 1. to 0. following a cosine curve.
     Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles`
-    ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
+    ![warmup cosine warm restarts schedule](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
 
 ## Examples
 
@@ -1158,9 +1187,11 @@ Here is how to use these techniques in our scripts:
 To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
+
 ```bash
 python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 ```
+
 Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
 
 ### Fine-tuning with BERT: running the examples
@@ -1174,7 +1205,7 @@ We showcase several fine-tuning examples based on (and extended from) [the origi
 
 #### GLUE results on dev set
 
-We get the following results on the dev set of GLUE benchmark with an uncased BERT base 
+We get the following results on the dev set of GLUE benchmark with an uncased BERT base
 model. All experiments were run on a P100 GPU with a batch size of 32.
 
 | Task | Metric | Result |
@@ -1253,6 +1284,7 @@ Our test ran on a few seeds with [the original implementation hyper-parameters](
 **Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
 First install apex as indicated [here](https://github.com/NVIDIA/apex).
 Then run
+
 ```shell
 export GLUE_DIR=/path/to/glue
 
@@ -1279,6 +1311,7 @@ python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   -
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
   acc = 0.8823529411764706
   acc_and_f1 = 0.901702786377709
@@ -1310,16 +1343,15 @@ python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   -
 
 This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
 
-
 #### SQuAD
 
 This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
 
 The data for SQuAD can be downloaded with the following links and should be saved in a `$SQUAD_DIR` directory.
 
-*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+- [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+- [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
 
 ```shell
 export SQUAD_DIR=/path/to/SQUAD
@@ -1340,12 +1372,13 @@ python run_bert_squad.py \
 ```
 
 Training with the previous hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
 {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
 ```
 
-**distributed training**
+##### distributed training
 
 Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
 
@@ -1368,6 +1401,7 @@ python -m torch.distributed.launch --nproc_per_node=8 \
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
 {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
@@ -1382,6 +1416,7 @@ python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
 {"exact_match": 84.18164616840113, "f1": 91.58645594850135}
@@ -1409,7 +1444,8 @@ python run_bert_swag.py \
 ```
 
 Training with the previous hyper-parameters on a single GPU gave us the following results:
-```
+
+```bash
 eval_accuracy = 0.8062081375587323
 eval_loss = 0.5966546792367169
 global_step = 13788
@@ -1422,7 +1458,6 @@ The data should be a text file in the same format as [sample_text.txt](./samples
 You can download an [exemplary training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from wikipedia articles and splitted into ~500k sentences with spaCy.
 Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`:
 
-
 Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the [`README`](./examples/lm_finetuning/README.md) of the [`examples/lm_finetuning/`](./examples/lm_finetuning/) folder.
 
 ### OpenAI GPT, Transformer-XL and GPT-2: running the examples
@@ -1471,11 +1506,13 @@ This command runs in about 1 min on a V100 and gives an evaluation perplexity of
 This example code is identical to the original unconditional and conditional generation codes.
 
 Conditional generation:
+
 ```shell
 python run_gpt2.py
 ```
 
 Unconditional generation:
+
 ```shell
 python run_gpt2.py --unconditional
 ```
@@ -1487,15 +1524,19 @@ The same option as in the original scripts are provided, please refere to the co
 The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
 
 For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
+
 ```bash
 {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
 ```
+
 To get these results we used a combination of:
+
 - multi-GPU training (automatically activated on a multi-GPU server),
 - 2 steps of gradient accumulation and
 - perform the optimization step on CPU to store Adam's averages in RAM.
 
 Here is the full list of hyper-parameters for this run:
+
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
@@ -1518,6 +1559,7 @@ python ./run_bert_squad.py \
 If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
 
 Here is an example of hyper-parameters for a FP16 run we tried:
+
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
@@ -1539,6 +1581,7 @@ python ./run_bert_squad.py \
 ```
 
 The results were similar to the above FP32 results (actually slightly higher):
+
 ```bash
 {"exact_match": 84.65468306527909, "f1": 91.238669287002}
 ```
@@ -1565,7 +1608,7 @@ python -m torch.distributed.launch --nproc_per_node=8 \
 
 ## Fine-tuning XLNet
 
-#### STS-B
+### STS-B
 
 This example code fine-tunes XLNet on the STS-B corpus.
 
@@ -1592,7 +1635,8 @@ python run_xlnet_classifier.py \
 
 Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) gave evaluation results between 84% and 88%.
 
-**Distributed training**
+### Distributed training
+
 Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
 
 ```bash
@@ -1611,6 +1655,7 @@ python -m torch.distributed.launch --nproc_per_node 8 \
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
   acc = 0.8823529411764706
   acc_and_f1 = 0.901702786377709
@@ -1646,15 +1691,15 @@ This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli
 
 There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
 
-- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
-- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+- [BERT Rediscovers the Classical NLP Pipeline](https://arxiv.org/abs/1905.05950) by Ian Tenney, Dipanjan Das, Ellie Pavlick
+- [Are Sixteen Heads Really Better than One?](https://arxiv.org/abs/1905.10650) by Paul Michel, Omer Levy, Graham Neubig
+- [What Does BERT Look At? An Analysis of BERT's Attention](https://arxiv.org/abs/1906.04341) by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning
 
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of [Michel et al.](https://arxiv.org/abs/1905.10650):
 
 - accessing all the hidden-states of BERT/GPT/GPT-2,
 - accessing all the attention weights for each head of BERT/GPT/GPT-2,
-- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in [Michel et al.](https://arxiv.org/abs/1905.10650).
 
 To help you understand and use these features, we have added a specific example script: [`bertology.py`](./examples/bertology.py) while extract information and prune a model pre-trained on MRPC.
 
@@ -1674,7 +1719,7 @@ Please follow the instructions given in the notebooks to run and modify them.
 
 A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the `BertForPreTraining` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the `OpenAIGPTModel` class  (for OpenAI GPT).
 
-### BERT
+### BERT CLI
 
 You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
 
@@ -1697,7 +1742,7 @@ pytorch_transformers bert \
 
 You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
 
-### OpenAI GPT
+### OpenAI GPT CLI
 
 Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm))
 
@@ -1710,7 +1755,7 @@ pytorch_transformers gpt \
   [OPENAI_GPT_CONFIG]
 ```
 
-### Transformer-XL
+### Transformer-XL CLI
 
 Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
 
@@ -1751,7 +1796,6 @@ pytorch_transformers xlnet \
   STS-B \
 ```
 
-
 ## TPU
 
 TPU support and pretraining scripts
diff --git a/setup.py b/setup.py
index 09b8c01ad5..fcb70fbc0b 100644
--- a/setup.py
+++ b/setup.py
@@ -41,10 +41,10 @@ setup(
     version="0.7.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
-    description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
+    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
-    keywords='BERT NLP deep learning google',
+    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
     license='Apache',
     url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",

From a4f980547f298f0d3a2cdfe6da919294c967cd06 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 12:31:34 +0200
Subject: [PATCH 058/139] remove circle ci parallelism

---
 .circleci/config.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ac23723f98..996eab6815 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,7 +12,6 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
-        parallelism: 4
     build_py2:
         working_directory: ~/pytorch-transformers
         docker:
@@ -25,7 +24,6 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
-        parallelism: 4
 workflows:
   version: 2
   build_and_test:

From 36bca545ff1c13eb7af710d38af4270ef6a965ed Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:02:59 +0200
Subject: [PATCH 059/139] tokenization abstract class - tests for examples

---
 examples/run_squad.py                         | 400 ++++++++++++++++++
 examples/test_examples.py                     |  48 +++
 pytorch_transformers/__init__.py              |   6 +-
 pytorch_transformers/modeling_bert.py         |   2 +-
 pytorch_transformers/modeling_gpt2.py         |   2 +-
 pytorch_transformers/modeling_openai.py       |   2 +-
 pytorch_transformers/modeling_transfo_xl.py   |   2 +-
 .../{model_utils.py => modeling_utils.py}     |   6 -
 pytorch_transformers/modeling_xlm.py          |   2 +-
 pytorch_transformers/modeling_xlnet.py        |   2 +-
 .../tests/model_utils_test.py                 |  50 ---
 .../tests/modeling_bert_test.py               |   2 +-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 ...s_commons.py => modeling_tests_commons.py} |   0
 .../tests/modeling_transfo_xl_test.py         |   2 +-
 .../tests/modeling_utils_test.py              |   9 +-
 .../tests/modeling_xlm_test.py                |   2 +-
 .../tests/modeling_xlnet_test.py              |   2 +-
 .../tests/tokenization_bert_test.py           |  10 +-
 .../tests/tokenization_gpt2_test.py           |  11 +-
 .../tests/tokenization_openai_test.py         |  10 +-
 .../tests/tokenization_transfo_xl_test.py     |   9 +-
 .../tests/tokenization_utils_test.py          |  36 ++
 .../tests/tokenization_xlm_test.py            |  12 +-
 .../tests/tokenization_xlnet_test.py          |  12 +-
 pytorch_transformers/tokenization_bert.py     |  66 +--
 pytorch_transformers/tokenization_gpt2.py     | 117 ++---
 pytorch_transformers/tokenization_openai.py   | 110 ++---
 .../tokenization_transfo_xl.py                |  78 ++--
 pytorch_transformers/tokenization_utils.py    | 114 +++++
 pytorch_transformers/tokenization_xlm.py      | 122 ++----
 pytorch_transformers/tokenization_xlnet.py    | 131 ++----
 33 files changed, 815 insertions(+), 566 deletions(-)
 create mode 100644 examples/run_squad.py
 create mode 100644 examples/test_examples.py
 rename pytorch_transformers/{model_utils.py => modeling_utils.py} (98%)
 delete mode 100644 pytorch_transformers/tests/model_utils_test.py
 rename pytorch_transformers/tests/{model_tests_commons.py => modeling_tests_commons.py} (100%)
 rename examples/tests/examples_tests.py => pytorch_transformers/tests/modeling_utils_test.py (92%)
 create mode 100644 pytorch_transformers/tests/tokenization_utils_test.py
 create mode 100644 pytorch_transformers/tokenization_utils.py

diff --git a/examples/run_squad.py b/examples/run_squad.py
new file mode 100644
index 0000000000..d6d7279cb8
--- /dev/null
+++ b/examples/run_squad.py
@@ -0,0 +1,400 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Run BERT on SQuAD."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import sys
+from io import open
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
+
+from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    ## Other parameters
+    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
+    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
+    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
+    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
+                             "of training.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
+                             "output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--overwrite_output_dir',
+                        action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--loss_scale',
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument('--version_2_with_negative',
+                        action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold',
+                        type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    print(args)
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_predict:
+        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
+
+    if args.do_train:
+        if not args.train_file:
+            raise ValueError(
+                "If `do_train` is True, then `train_file` must be specified.")
+    if args.do_predict:
+        if not args.predict_file:
+            raise ValueError(
+                "If `do_predict` is True, then `predict_file` must be specified.")
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+        # Prepare data loader
+        train_examples = read_squad_examples(
+            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
+        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
+            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
+        try:
+            with open(cached_train_features_file, "rb") as reader:
+                train_features = pickle.load(reader)
+        except:
+            train_features = convert_examples_to_features(
+                examples=train_examples,
+                tokenizer=tokenizer,
+                max_seq_length=args.max_seq_length,
+                doc_stride=args.doc_stride,
+                max_query_length=args.max_query_length,
+                is_training=True)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                with open(cached_train_features_file, "wb") as writer:
+                    pickle.dump(train_features, writer)
+
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                   all_start_positions, all_end_positions)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        # if args.local_rank != -1:
+        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        global_step = 0
+
+        logger.info("***** Running training *****")
+        logger.info("  Num orig examples = %d", len(train_examples))
+        logger.info("  Num split examples = %d", len(train_features))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+
+        model.train()
+        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                if n_gpu == 1:
+                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
+                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
+                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used and handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+
+    model.to(device)
+
+    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_examples = read_squad_examples(
+            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
+        eval_features = convert_examples_to_features(
+            examples=eval_examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=False)
+
+        logger.info("***** Running predictions *****")
+        logger.info("  Num orig examples = %d", len(eval_examples))
+        logger.info("  Num split examples = %d", len(eval_features))
+        logger.info("  Batch size = %d", args.predict_batch_size)
+
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
+
+        model.eval()
+        all_results = []
+        logger.info("Start evaluating")
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
+            if len(all_results) % 1000 == 0:
+                logger.info("Processing example: %d" % (len(all_results)))
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            with torch.no_grad():
+                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
+            for i, example_index in enumerate(example_indices):
+                start_logits = batch_start_logits[i].detach().cpu().tolist()
+                end_logits = batch_end_logits[i].detach().cpu().tolist()
+                eval_feature = eval_features[example_index.item()]
+                unique_id = int(eval_feature.unique_id)
+                all_results.append(RawResult(unique_id=unique_id,
+                                             start_logits=start_logits,
+                                             end_logits=end_logits))
+        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
+        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
+        write_predictions(eval_examples, eval_features, all_results,
+                          args.n_best_size, args.max_answer_length,
+                          args.do_lower_case, output_prediction_file,
+                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                          args.version_2_with_negative, args.null_score_diff_threshold)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/test_examples.py b/examples/test_examples.py
new file mode 100644
index 0000000000..fada43dae2
--- /dev/null
+++ b/examples/test_examples.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unittest
+import argparse
+
+try:
+    # python 3.4+ can use builtin unittest.mock instead of mock package
+    from unittest.mock import patch
+except ImportError:
+    from mock import patch
+
+import run_bert_squad as rbs
+
+def get_setup_file():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f')
+    args = parser.parse_args()
+    return args.f
+
+class ExamplesTests(unittest.TestCase):
+
+    def test_run_squad(self):
+        testargs = ["prog", "-f", "/home/test/setup.py"]
+        with patch.object(sys, 'argv', testargs):
+            setup = get_setup_file()
+            assert setup == "/home/test/setup.py"
+            # rbs.main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index cbd007f872..6dd78dfd02 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -5,6 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
+from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
@@ -26,11 +27,10 @@ from .modeling_xlnet import (XLNetConfig,
 from .modeling_xlm import (XLMConfig, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering)
+from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
+                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
-
-from .model_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
-                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index d4967b3718..b2a456209d 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -29,7 +29,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
+from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index c16ad2f763..090763cda1 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 1a3e7fbbb4..b715b18371 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 9a882bce96..465577b002 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -37,7 +37,7 @@ from torch.nn.parameter import Parameter
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/model_utils.py b/pytorch_transformers/modeling_utils.py
similarity index 98%
rename from pytorch_transformers/model_utils.py
rename to pytorch_transformers/modeling_utils.py
index 051fbdefbc..b72707ce08 100644
--- a/pytorch_transformers/model_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -598,9 +598,3 @@ def prune_layer(layer, index, dim=None):
         return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
     else:
         raise ValueError("Can't prune layer of class {}".format(layer.__class__))
-
-def clean_up_tokenization(out_string):
-    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-    return out_string
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 6decba3cce..14f8848a42 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -35,7 +35,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                           prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index f5841e0601..289dcbd9db 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -32,7 +32,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                           SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 
diff --git a/pytorch_transformers/tests/model_utils_test.py b/pytorch_transformers/tests/model_utils_test.py
deleted file mode 100644
index 120df35f82..0000000000
--- a/pytorch_transformers/tests/model_utils_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
-
-
-class ModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained(self):
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index b140f5e647..2ba59317be 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class BertModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
index 4ace52571a..7400c9f64d 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class GPT2ModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
index fe81157023..27263ecb24 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -24,7 +24,7 @@ import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class OpenAIModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/model_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
similarity index 100%
rename from pytorch_transformers/tests/model_tests_commons.py
rename to pytorch_transformers/tests/modeling_tests_commons.py
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index d15a19eb64..f2906d879f 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
diff --git a/examples/tests/examples_tests.py b/pytorch_transformers/tests/modeling_utils_test.py
similarity index 92%
rename from examples/tests/examples_tests.py
rename to pytorch_transformers/tests/modeling_utils_test.py
index 120df35f82..1866d35353 100644
--- a/examples/tests/examples_tests.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -16,17 +16,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 8a8905cc31..9c511f21a8 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,7 +23,7 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class XLMModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index b9d55a26c7..b762426d2c 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
 class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 59a87a4cb9..37e20cc286 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
+                                                  _is_whitespace)
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -49,14 +49,6 @@ class TokenizationTest(unittest.TestCase):
 
         os.remove(vocab_file)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
     def test_chinese(self):
         tokenizer = BasicTokenizer()
 
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index c6d926bdd4..8b06161b53 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,10 +17,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
 
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -56,13 +54,6 @@ class GPT2TokenizationTest(unittest.TestCase):
         os.remove(vocab_file)
         os.remove(merges_file)
 
-    # @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 38315f927b..3f8c49f888 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -58,14 +58,6 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index f744e319c8..f583e30b56 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -59,13 +59,6 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/pytorch_transformers/tests/tokenization_utils_test.py
new file mode 100644
index 0000000000..e8856d50c2
--- /dev/null
+++ b/pytorch_transformers/tests/tokenization_utils_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from pytorch_transformers import PreTrainedTokenizer
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+
+class TokenizerUtilsTest(unittest.TestCase):
+    def check_tokenizer_from_pretrained(self, tokenizer_class):
+        s3_models = list(tokenizer_class.max_model_input_sizes.keys())
+        for model_name in s3_models[:1]:
+            tokenizer = tokenizer_class.from_pretrained(model_name)
+            self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, PreTrainedTokenizer)
+
+    def test_pretrained_tokenizers(self):
+        self.check_tokenizer_from_pretrained(GPT2Tokenizer)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 9cc18f3d60..00d273a628 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,9 +20,9 @@ import json
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class XLMTokenizationTest(unittest.TestCase):
 
@@ -57,14 +57,6 @@ class XLMTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 4dd76e114b..6e81f214b7 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,9 +19,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer,
-                                                        PRETRAINED_VOCAB_ARCHIVE_MAP,
-                                                        SPIECE_UNDERLINE)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -60,14 +58,6 @@ class XLNetTokenizationTest(unittest.TestCase):
                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                            u'<unk>', u'.'])
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index c8db62b9c0..b26e5066e9 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -23,11 +23,15 @@ import unicodedata
 from io import open
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
@@ -41,8 +45,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-uncased': 512,
     'bert-large-uncased': 512,
     'bert-base-cased': 512,
@@ -57,7 +62,6 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
     'bert-large-cased-whole-word-masking-finetuned-squad': 512,
     'bert-base-cased-finetuned-mrpc': 512,
 }
-VOCAB_NAME = 'vocab.txt'
 
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
@@ -83,8 +87,11 @@ def whitespace_tokenize(text):
     return tokens
 
 
-class BertTokenizer(object):
+class BertTokenizer(PreTrainedTokenizer):
     """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
                  never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
@@ -203,7 +210,7 @@ class BertTokenizer(object):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
@@ -215,13 +222,10 @@ class BertTokenizer(object):
         return (vocab_file,)
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """ Instantiate a BertTokenizer from pre-trained vocabulary files.
         """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+        if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
             if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
                 logger.warning("The pre-trained model you are loading is a cased model but you have not set "
                                "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
@@ -232,40 +236,8 @@ class BertTokenizer(object):
                                "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
                                "but you may want to check this behavior.")
                 kwargs['do_lower_case'] = True
-        else:
-            vocab_file = pretrained_model_name_or_path
-        if os.path.isdir(vocab_file):
-            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
-        return tokenizer
+
+        return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
 
 class BasicTokenizer(object):
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 2947ce66b8..abdfe39c1c 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -23,8 +23,6 @@ import os
 import regex as re
 from io import open
 
-from .model_utils import clean_up_tokenization
-
 try:
     from functools import lru_cache
 except ImportError:
@@ -33,24 +31,38 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
-from .file_utils import cached_path
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+    },
+    'merges_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'gpt2': None,
+        'gpt2-medium': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
+    'gpt2-medium': 1024,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 @lru_cache()
 def bytes_to_unicode():
@@ -87,70 +99,16 @@ def get_pairs(word):
         prev_char = char
     return pairs
 
-class GPT2Tokenizer(object):
+class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
         - Byte-level BPE
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a GPT2Tokenizer from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, errors='replace', max_len=None):
         self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
@@ -165,9 +123,16 @@ class GPT2Tokenizer(object):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -285,9 +250,9 @@ class GPT2Tokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 7d005a8260..419dfdad92 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -26,23 +26,35 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+    },
+    'merges_file':
+    {
+        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'openai-gpt': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'openai-gpt': 512,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 def get_pairs(word):
     """
@@ -71,7 +83,7 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class OpenAIGPTTokenizer(object):
+class OpenAIGPTTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer. Peculiarities:
         - lower case all inputs
@@ -79,65 +91,11 @@ class OpenAIGPTTokenizer(object):
         - argument special_tokens and function set_special_tokens:
             can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
@@ -156,9 +114,17 @@ class OpenAIGPTTokenizer(object):
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -286,9 +252,9 @@ class OpenAIGPTTokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 7e83680770..a86c8fe460 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -31,7 +31,7 @@ import torch
 import numpy as np
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -41,66 +41,35 @@ else:
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
+VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'pretrained_vocab_file':
+    {
+        'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'transfo-xl-wt103': 512,
 }
-VOCAB_NAME = 'vocab.bin'
 
 PRETRAINED_CORPUS_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
 }
 CORPUS_NAME = 'corpus.bin'
 
-class TransfoXLTokenizer(object):
+class TransfoXLTokenizer(PreTrainedTokenizer):
     """
     Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a TransfoXLTokenizer.
-        The TransfoXLTokenizer.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            if os.path.isdir(pretrained_model_name_or_path):
-                vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            else:
-                vocab_file = pretrained_model_name_or_path
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-
-        # Instantiate tokenizer.
-        tokenizer = cls(*inputs, **kwargs)
-        vocab_dict = torch.load(resolved_vocab_file)
-        for key, value in vocab_dict.items():
-            tokenizer.__dict__[key] = value
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
-                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
+                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
+                 never_split=("<unk>", "<eos>", "<formula>")):
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -110,6 +79,13 @@ class TransfoXLTokenizer(object):
         self.vocab_file = vocab_file
         self.never_split = never_split
 
+        if pretrained_vocab_file is not None:
+            # Hack because, honestly this tokenizer was not made to be used
+            # in a library like ours, at all.
+            vocab_dict = torch.load(pretrained_vocab_file)
+            for key, value in vocab_dict.items():
+                self.__dict__[key] = value
+
         if vocab_file is not None:
             self.build_vocab()
 
@@ -157,7 +133,7 @@ class TransfoXLTokenizer(object):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
         torch.save(self.__dict__, vocab_file)
         return (vocab_file,)
 
@@ -484,7 +460,7 @@ class TransfoXLCorpus(object):
                 "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
-                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
                     pretrained_model_name_or_path,
                     corpus_file))
             return None
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
new file mode 100644
index 0000000000..98a2968539
--- /dev/null
+++ b/pytorch_transformers/tokenization_utils.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import sys
+import json
+import logging
+import os
+import regex as re
+from io import open
+
+try:
+    from functools import lru_cache
+except ImportError:
+    # Just a dummy decorator to get the checks to run on python2
+    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
+    def lru_cache():
+        return lambda func: func
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+
+class PreTrainedTokenizer(object):
+    """ An abstract class to handle dowloading and loading pretrained tokenizers.
+    """
+    vocab_files_names = {}
+    pretrained_vocab_files_map = {}
+    max_model_input_sizes = {}
+
+    @classmethod
+    def from_pretrained(cls, *inputs, **kwargs):
+        return cls._from_pretrained(*inputs, **kwargs)
+
+    @classmethod
+    def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
+        Download and cache the vocabulary files if needed.
+        """
+        s3_models = list(cls.max_model_input_sizes.keys())
+        vocab_files = {}
+        if pretrained_model_name_or_path in s3_models:
+            for file_id, map_list in cls.pretrained_vocab_files_map.items():
+                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
+        else:
+            for file_id, file_name in cls.vocab_files_names.items():
+                if os.path.isdir(pretrained_model_name_or_path):
+                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
+                else:
+                    full_file_name = pretrained_model_name_or_path
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We don't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_files = {}
+            for file_id, file_path in vocab_files.items():
+                if file_path is None:
+                    resolved_vocab_files[file_id] = None
+                else:
+                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in s3_models:
+                logger.error("Couldn't reach server to download vocabulary.")
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find files {} "
+                    "at this path or url.".format(
+                        pretrained_model_name_or_path, ', '.join(s3_models),
+                        pretrained_model_name_or_path, str(vocab_files.keys())))
+            return None
+
+        for file_id, file_path in vocab_files.items():
+            if file_path == resolved_vocab_files[file_id]:
+                logger.info("loading file {}".format(file_path))
+            else:
+                logger.info("loading file {} from cache at {}".format(
+                    file_path, resolved_vocab_files[file_id]))
+
+        if pretrained_model_name_or_path in cls.max_model_input_sizes:
+            # if we're using a pretrained model, ensure the tokenizer
+            # wont index sequences longer than the number of positional embeddings
+            max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
+            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+
+        # Instantiate tokenizer.
+        tokenizer = cls(*inputs, **resolved_vocab_files, **kwargs)
+
+        return tokenizer
+
+
+def clean_up_tokenization(out_string):
+    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+    return out_string
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 26c73c56b2..e37f3888a3 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -26,30 +26,42 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+    },
+    'merges_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'xlm-mlm-en-2048': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlm-mlm-en-2048': 512,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
-INDEX= {
-  "bos_index": 0,
-  "eos_index": 1,
-  "pad_index": 2,
-  "unk_index": 3,
-  "mask_index": 5
+INDEX = {
+    "bos_index": 0,
+    "eos_index": 1,
+    "pad_index": 2,
+    "unk_index": 3,
+    "mask_index": 5
 }
 
 def get_pairs(word):
@@ -79,7 +91,7 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class XLMTokenizer(object):
+class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
         - lower case all inputs
@@ -87,65 +99,11 @@ class XLMTokenizer(object):
         - argument special_tokens and function set_special_tokens:
             can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
@@ -164,9 +122,17 @@ class XLMTokenizer(object):
         merges = [tuple(merge.split()[:2]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -294,9 +260,9 @@ class XLMTokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 76b9a9f870..a30e6db8da 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -27,15 +27,24 @@ import unicodedata
 import six
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    }
 }
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlnet-large-cased': 512,
+}
+
 VOCAB_NAME = 'spiece.model'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 SPIECE_UNDERLINE = u'▁'
 
@@ -46,7 +55,7 @@ SEG_ID_CLS = 2
 SEG_ID_SEP = 3
 SEG_ID_PAD = 4
 
-class XLNetTokenizer(object):
+class XLNetTokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
@@ -63,64 +72,11 @@ class XLNetTokenizer(object):
         "<eod>"  : 7,
         "<eop>"  : 8,
     }
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
-                               "you may want to check this behavior.")
-                kwargs['do_lower_case'] = False
-            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
-                               "but you may want to check this behavior.")
-                kwargs['do_lower_case'] = True
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {}"
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, special_tokens=None, max_len=None,
+    def __init__(self, vocab_file, max_len=None,
                  do_lower_case=False, remove_space=True, keep_accents=False):
         try:
             import sentencepiece as spm
@@ -136,9 +92,6 @@ class XLNetTokenizer(object):
 
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
 
     @property
     def UNK_TOKEN(self):
@@ -181,7 +134,7 @@ class XLNetTokenizer(object):
         return self.special_symbols["<mask>"]
 
     def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
+        return len(self.sp_model)
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -198,19 +151,6 @@ class XLNetTokenizer(object):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.sp_model) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens: %s", str(self.special_tokens))
-
     def preprocess_text(self, inputs):
         if self.remove_space:
             outputs = ' '.join(inputs.strip().split())
@@ -272,15 +212,9 @@ class XLNetTokenizer(object):
         """ Converts a sequence of tokens into ids using the vocab. """
         ids = []
         if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.sp_model.PieceToId(tokens)
+            return self.sp_model.PieceToId(tokens)
         for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.sp_model.PieceToId(token))
+            ids.append(self.sp_model.PieceToId(token))
         if len(ids) > self.max_len:
             logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
@@ -289,15 +223,11 @@ class XLNetTokenizer(object):
             )
         return ids
 
-    def convert_ids_to_tokens(self, ids, return_unicode=True, skip_special_tokens=False):
+    def convert_ids_to_tokens(self, ids, return_unicode=True):
         """Converts a sequence of ids in tokens."""
         tokens = []
         for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.sp_model.IdToPiece(i))
+            tokens.append(self.sp_model.IdToPiece(i))
 
         if six.PY2 and return_unicode:
             ret_pieces = []
@@ -311,9 +241,9 @@ class XLNetTokenizer(object):
     def encode(self, text, sample=False):
         return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
 
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def decode(self, ids, clean_up_tokenization_spaces=True):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
+        tokens = self.convert_ids_to_tokens(ids)
         out_string = ''.join(tokens)
         if clean_up_tokenization_spaces:
             out_string = out_string.strip().replace('<unk>', '')
@@ -328,18 +258,7 @@ class XLNetTokenizer(object):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
         out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
 
         copyfile(self.vocab_file, out_vocab_file)
 
-        index = len(self.sp_model)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return out_vocab_file, special_tokens_file
+        return (out_vocab_file,)

From 6dacc79d395bd41e0ef76c2a043c2ef90cc79925 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:11:59 +0200
Subject: [PATCH 060/139] fix python2 tests

---
 pytorch_transformers/tests/tokenization_tests_commons.py | 6 ++----
 pytorch_transformers/tokenization_utils.py               | 6 +++++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index e8f7ee7a25..876f7747be 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -12,9 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
 import sys
@@ -47,7 +45,7 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
     tokenizer = tokenizer_class(*inputs, **kwargs)
 
-    text = "Munich and Berlin are nice cities"
+    text = u"Munich and Berlin are nice cities"
     filename = u"/tmp/tokenizer.bin"
 
     subwords = tokenizer.tokenize(text)
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 98a2968539..c6f08c41ae 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -101,8 +101,12 @@ class PreTrainedTokenizer(object):
             max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
 
+        # Merge resolved_vocab_files arguments in kwargs.
+        for args_name, file_path in resolved_vocab_files.items():
+            kwargs[args_name] = file_path
+
         # Instantiate tokenizer.
-        tokenizer = cls(*inputs, **resolved_vocab_files, **kwargs)
+        tokenizer = cls(*inputs, **kwargs)
 
         return tokenizer
 

From 162ba383b05e502b9fc5df4d4abb5951c020d3bc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:57:14 +0200
Subject: [PATCH 061/139] fix model loading

---
 examples/run_bert_classifier.py               |  3 ++-
 pytorch_transformers/modeling_utils.py        | 22 ++++++++++++++++++-
 .../tests/modeling_utils_test.py              |  7 ++++--
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 506aecc5b1..6f3d26cee1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -308,7 +308,8 @@ def main():
                 input_ids, input_mask, segment_ids, label_ids = batch
 
                 # define a new function to compute loss values for both output_modes
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                loss = 
 
                 if output_mode == "classification":
                     loss_fct = CrossEntropyLoss()
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index b72707ce08..96558704ea 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -193,7 +193,8 @@ class PreTrainedModel(nn.Module):
         """
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
-        from_tf = kwargs.pop('from_tf', None)
+        from_tf = kwargs.pop('from_tf', False)
+        output_loading_info = kwargs.pop('output_loading_info', False)
 
         # Load config
         config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
@@ -239,6 +240,21 @@ class PreTrainedModel(nn.Module):
             # Directly load from a TensorFlow checkpoint
             return cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
 
+        # Convert old format to new format if needed from a PyTorch state_dict
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
         # Load from a PyTorch state_dict
         missing_keys = []
         unexpected_keys = []
@@ -279,6 +295,10 @@ class PreTrainedModel(nn.Module):
         if hasattr(model, 'tie_weights'):
             model.tie_weights()  # make sure word embedding weights are still tied
 
+        if output_loading_info:
+            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
+            return model, loading_info
+
         return model
 
 
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
index 1866d35353..5e3b8e676a 100644
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -17,21 +17,24 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
+import logging
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
 from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
 
-
 class ModelUtilsTest(unittest.TestCase):
     def test_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             config = BertConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, PretrainedConfig)
 
-            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, PreTrainedModel)
+            for value in loading_info.values():
+                self.assertEqual(len(value), 0)
 
             config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
             model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)

From 1113f97f33ca939457370c767cad345a1a949fda Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 16:31:13 +0200
Subject: [PATCH 062/139] clean up glue example

---
 examples/run_bert_classifier.py            |  20 +-
 examples/run_glue.py                       | 401 +++++++++++++++++++++
 examples/utils_glue.py                     |   1 +
 pytorch_transformers/tokenization_utils.py |  18 +
 4 files changed, 423 insertions(+), 17 deletions(-)
 create mode 100644 examples/run_glue.py

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 6f3d26cee1..27b8e6165d 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -309,14 +309,7 @@ def main():
 
                 # define a new function to compute loss values for both output_modes
                 ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                loss = 
-
-                if output_mode == "classification":
-                    loss_fct = CrossEntropyLoss()
-                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-                elif output_mode == "regression":
-                    loss_fct = MSELoss()
-                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+                loss = ouputs[0]
 
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
@@ -423,15 +416,8 @@ def main():
             label_ids = label_ids.to(device)
 
             with torch.no_grad():
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-            # create eval loss and other metric required by the task
-            if output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-            elif output_mode == "regression":
-                loss_fct = MSELoss()
-                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                tmp_eval_loss, logits = outputs[:2]
 
             eval_loss += tmp_eval_loss.mean().item()
             nb_eval_steps += 1
diff --git a/examples/run_glue.py b/examples/run_glue.py
new file mode 100644
index 0000000000..da1e8d8123
--- /dev/null
+++ b/examples/run_glue.py
@@ -0,0 +1,401 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import sys
+import random
+from tqdm import tqdm, trange
+
+import numpy as np
+
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from tensorboardX import SummaryWriter
+
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForSequenceClassification
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size", default=32, type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size", default=8, type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--loss_scale', type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+                device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    # Setup seeds
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    # Safety checks and create output directory
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    # Prepare GLUE task
+    task_name = args.task_name.lower()
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    # Distributed, parrallel and fp16 model
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    global_step = 0
+    tr_loss = 0
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+
+        # Load and cache data
+        train_examples = processor.get_train_examples(args.data_dir)
+        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
+            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
+        if os.path.exists(cached_train_features_file):
+            train_features = torch.load(cached_train_features_file)
+        else:
+            train_features = convert_examples_to_features(
+                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                torch.save(train_features, cached_train_features_file)
+
+        # Convert in tensors and build dataloader
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
+        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer, FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        # Train!
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                loss = ouputs[0]
+
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu parallel training
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+
+                tr_loss += loss.item()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used that handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    ### Example:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForSequenceClassification.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = BertForSequenceClassification.from_pretrained(args.bert_model)
+
+    model.to(device)
+
+    ### Evaluation
+    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
+        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if task_name == "mnli" else (args.output_dir,)
+        for eval_task, output_dir in zip(eval_task_names, eval_outputs_dirs):
+            if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
+                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+
+            # Load and cache data
+            processor = processors[eval_task]()
+            eval_examples = processor.get_dev_examples(args.data_dir)
+            cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
+                list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(eval_task)))
+            if os.path.exists(cached_eval_features_file):
+                eval_features = torch.load(cached_eval_features_file)
+            else:
+                eval_features = convert_examples_to_features(
+                    eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                    logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
+                torch.save(eval_features, cached_eval_features_file)
+
+            # Convert in tensors and build dataloader
+            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+            if output_mode == "classification":
+                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+            elif output_mode == "regression":
+                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
+            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+            # Note that DistributedSampler samples randomly
+            eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+            # Eval!
+            logger.info("***** Running evaluation *****")
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+            model.eval()
+            eval_loss = 0
+            nb_eval_steps = 0
+            preds = None
+            out_label_ids = None
+            for batch in tqdm(eval_dataloader, desc="Evaluating"):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                with torch.no_grad():
+                    outputs = model(input_ids,
+                                    token_type_ids=segment_ids,
+                                    attention_mask=input_mask,
+                                    labels=label_ids)
+                    tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+                nb_eval_steps += 1
+                if preds is None:
+                    preds = logits.detach().cpu().numpy()
+                    out_label_ids = label_ids.detach().cpu().numpy()
+                else:
+                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+            eval_loss = eval_loss / nb_eval_steps
+            if output_mode == "classification":
+                preds = np.argmax(preds, axis=1)
+            elif output_mode == "regression":
+                preds = np.squeeze(preds)
+            result = compute_metrics(eval_task, preds, out_label_ids)
+
+            loss = tr_loss/global_step if args.do_train else None
+
+            result['eval_loss'] = eval_loss
+            result['global_step'] = global_step
+            result['loss'] = loss
+
+            output_eval_file = os.path.join(output_dir, "eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index e3e4179fae..18e733567d 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -583,6 +583,7 @@ processors = {
 output_modes = {
     "cola": "classification",
     "mnli": "classification",
+    "mnli-mm": "classification",
     "mrpc": "classification",
     "sst-2": "classification",
     "sts-b": "regression",
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index c6f08c41ae..9004315657 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -110,6 +110,24 @@ class PreTrainedTokenizer(object):
 
         return tokenizer
 
+    def tokenize(self, text):
+        raise NotImplementedError
+
+    def convert_tokens_to_ids(self, tokens):
+        raise NotImplementedError
+
+    def convert_ids_to_tokens(self, ids):
+        raise NotImplementedError
+
+    def encode(self, text):
+        raise NotImplementedError
+
+    def decode(self, token_ids, *input, **kwargs):
+        raise NotImplementedError
+
+    def save_vocabulary(self, vocab_path):
+        raise NotImplementedError
+
 
 def clean_up_tokenization(out_string):
     out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','

From 99b90edab1c144ba92ff59e50a2811936a09550c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 17:09:35 +0200
Subject: [PATCH 063/139] cleaning up run_glue example

---
 examples/run_glue.py | 413 +++++++++++++++++++++----------------------
 1 file changed, 204 insertions(+), 209 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index da1e8d8123..1c2e921ef7 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -20,7 +20,6 @@ from __future__ import absolute_import, division, print_function
 import argparse
 import logging
 import os
-import sys
 import random
 from tqdm import tqdm, trange
 
@@ -30,7 +29,6 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
@@ -45,6 +43,186 @@ from utils_glue import processors, output_modes, convert_examples_to_features, c
 logger = logging.getLogger(__name__)
 
 
+def train(args, train_features, model):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    # Convert in tensors and build dataloader
+    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+    if args.output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+    elif args.output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    if args.fp16:
+        try:
+            from apex.optimizers import FP16_Optimizer, FusedAdam
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
+        if args.loss_scale == 0:
+            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        else:
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
+
+    else:
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                                lr=args.learning_rate,
+                                warmup=args.warmup_proportion,
+                                t_total=num_train_optimization_steps)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Num steps = %d", num_train_optimization_steps)
+
+    global_step = 0
+    tr_loss = 0
+    model.train()
+    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            batch = tuple(t.to(args.device) for t in batch)
+            input_ids, input_mask, segment_ids, label_ids = batch
+
+            ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            loss = ouputs[0]
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            loss.backward() if not args.fp16 else optimizer.backward(loss)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    # modify learning rate with special warm up BERT uses
+                    # if args.fp16 is False, BertAdam is used that handles this automatically
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                optimizer.step()
+                optimizer.zero_grad()
+                global_step += 1
+                if args.local_rank in [-1, 0]:
+                    if not args.fp16:
+                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    return global_step, tr_loss / global_step
+
+
+def evalutate(args, eval_task, eval_output_dir, eval_features, model):
+    """ Evaluate the model """
+    if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
+    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(eval_output_dir)
+
+    # Convert in tensors and build dataloader
+    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+    if args.output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+    elif args.output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
+    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation *****")
+    logger.info("  Num examples = %d", len(eval_examples))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    model.eval()
+    eval_loss = 0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+        input_ids, input_mask, segment_ids, label_ids = batch
+
+        with torch.no_grad():
+            outputs = model(input_ids,
+                            token_type_ids=segment_ids,
+                            attention_mask=input_mask,
+                            labels=label_ids)
+            tmp_eval_loss, logits = outputs[:2]
+
+        eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = label_ids.detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    if args.output_mode == "classification":
+        preds = np.argmax(preds, axis=1)
+    elif args.output_mode == "regression":
+        preds = np.squeeze(preds)
+    result = compute_metrics(eval_task, preds, out_label_ids)
+
+    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+def load_and_cache_examples(args, task, tokenizer, eval=False):
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    label_list = processor.get_labels()
+
+    # Load and cache data
+    processor = processors[task]()
+    examples = processor.get_dev_examples(args.data_dir)
+    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
+        'dev' if eval else 'train',
+        list(filter(None, args.bert_model.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+
+    if os.path.exists(cached_features_file):
+        features = torch.load(cached_features_file)
+    else:
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+            logger.info("  Saving eval features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    return features
+
+
 def main():
     parser = argparse.ArgumentParser()
 
@@ -118,40 +296,32 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+        args.n_gpu = torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend='nccl')
-        n_gpu = 1
+        args.n_gpu = 1
     args.device = device
 
     # Setup logging
     logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-                device, n_gpu, bool(args.local_rank != -1), args.fp16))
+        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
 
     # Setup seeds
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu > 0:
+    if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
-    # Safety checks and create output directory
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
     # Prepare GLUE task
-    task_name = args.task_name.lower()
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
     label_list = processor.get_labels()
     num_labels = len(label_list)
 
@@ -169,122 +339,23 @@ def main():
     # Distributed, parrallel and fp16 model
     if args.fp16:
         model.half()
-    model.to(device)
+    model.to(args.device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model,
                                                           device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
-    global_step = 0
-    tr_loss = 0
+    # Training
     if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
+        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
+        global_step, tr_loss = train(args, train_features, model)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-        # Load and cache data
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
-        if os.path.exists(cached_train_features_file):
-            train_features = torch.load(cached_train_features_file)
-        else:
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                torch.save(train_features, cached_train_features_file)
 
-        # Convert in tensors and build dataloader
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer, FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        # Train!
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                loss = ouputs[0]
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu parallel training
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                tr_loss += loss.item()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
@@ -307,94 +378,18 @@ def main():
     else:
         model = BertForSequenceClassification.from_pretrained(args.bert_model)
 
-    model.to(device)
+    model.to(args.device)
 
-    ### Evaluation
+    # Evaluation
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
-        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if task_name == "mnli" else (args.output_dir,)
-        for eval_task, output_dir in zip(eval_task_names, eval_outputs_dirs):
-            if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
+        # Handle MNLI double evaluation
+        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
-            # Load and cache data
-            processor = processors[eval_task]()
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-                list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(eval_task)))
-            if os.path.exists(cached_eval_features_file):
-                eval_features = torch.load(cached_eval_features_file)
-            else:
-                eval_features = convert_examples_to_features(
-                    eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                    logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                torch.save(eval_features, cached_eval_features_file)
+        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+            eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
 
-            # Convert in tensors and build dataloader
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            if output_mode == "classification":
-                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-            elif output_mode == "regression":
-                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Note that DistributedSampler samples randomly
-            eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            # Eval!
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = None
-            out_label_ids = None
-            for batch in tqdm(eval_dataloader, desc="Evaluating"):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                with torch.no_grad():
-                    outputs = model(input_ids,
-                                    token_type_ids=segment_ids,
-                                    attention_mask=input_mask,
-                                    labels=label_ids)
-                    tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if preds is None:
-                    preds = logits.detach().cpu().numpy()
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            if output_mode == "classification":
-                preds = np.argmax(preds, axis=1)
-            elif output_mode == "regression":
-                preds = np.squeeze(preds)
-            result = compute_metrics(eval_task, preds, out_label_ids)
-
-            loss = tr_loss/global_step if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(output_dir, "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
+            evalutate(args, eval_task, eval_output_dir, eval_features, model)
 
 
 if __name__ == "__main__":

From 3d5f2913864de28a57a339c4c0c9f7b6000a7d03 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 17:22:15 +0200
Subject: [PATCH 064/139] updates to run_glue

---
 examples/run_glue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 1c2e921ef7..8dd845a553 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -213,11 +213,12 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
         str(task)))
 
     if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
         if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-            logger.info("  Saving eval features into cached file %s", cached_features_file)
+            logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
     return features

From 03de9686a7365c1d986451bf94de8060c5328a8e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 5 Jul 2019 17:11:13 -0400
Subject: [PATCH 065/139] Initial folder structure for the documentation. A
 draft of documentation change has been made in the BertModel class.

---
 docs/Makefile                            |  19 +
 docs/README.md                           |  23 +
 docs/source/cli.rst                      |  86 ++++
 docs/source/conf.py                      | 180 +++++++
 docs/source/doc.rst                      | 521 ++++++++++++++++++++
 docs/source/examples.rst                 | 593 +++++++++++++++++++++++
 docs/source/index.rst                    | 207 ++++++++
 docs/source/installation.rst             |  48 ++
 docs/source/notebooks.rst                |  16 +
 docs/source/tpu.rst                      |  13 +
 docs/source/usage.rst                    | 339 +++++++++++++
 pytorch_pretrained_bert/modeling_bert.py | 104 ++--
 12 files changed, 2109 insertions(+), 40 deletions(-)
 create mode 100644 docs/Makefile
 create mode 100644 docs/README.md
 create mode 100644 docs/source/cli.rst
 create mode 100644 docs/source/conf.py
 create mode 100644 docs/source/doc.rst
 create mode 100644 docs/source/examples.rst
 create mode 100644 docs/source/index.rst
 create mode 100644 docs/source/installation.rst
 create mode 100644 docs/source/notebooks.rst
 create mode 100644 docs/source/tpu.rst
 create mode 100644 docs/source/usage.rst

diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000000..8879933e6c
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000..22f1116c87
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,23 @@
+# Generating the documentation
+
+To generate the documentation, you first have to build it. Building it requires the package `sphinx` that you can 
+install using:
+
+```bash
+pip install -U sphinx
+```
+
+You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 
+[Read The Docs](https://readthedocs.org/). You can install it using the following command:
+
+```bash
+pip install sphinx_rtd_theme
+```
+
+Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
+
+```bash
+make html
+```
+
+It should build the static app that will be available under `/docs/_build/html`
\ No newline at end of file
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
new file mode 100644
index 0000000000..22da24550b
--- /dev/null
+++ b/docs/source/cli.rst
@@ -0,0 +1,86 @@
+CLI
+================================================
+
+A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the ``BertForPreTraining`` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the ``OpenAIGPTModel`` class  (for OpenAI GPT).
+
+BERT
+^^^^
+
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `\ ``convert_tf_checkpoint_to_pytorch.py`` <./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py>`_ script.
+
+This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `\ ``run_bert_extract_features.py`` <./examples/run_bert_extract_features.py>`_\ , `\ ``run_bert_classifier.py`` <./examples/run_bert_classifier.py>`_ and `\ ``run_bert_squad.py`` <./examples/run_bert_squad.py>`_\ ).
+
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
+
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
+
+Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
+
+.. code-block:: shell
+
+   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+
+   pytorch_pretrained_bert bert \
+     $BERT_BASE_DIR/bert_model.ckpt \
+     $BERT_BASE_DIR/bert_config.json \
+     $BERT_BASE_DIR/pytorch_model.bin
+
+You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
+
+OpenAI GPT
+^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ )
+
+.. code-block:: shell
+
+   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+   pytorch_pretrained_bert gpt \
+     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [OPENAI_GPT_CONFIG]
+
+Transformer-XL
+^^^^^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
+
+.. code-block:: shell
+
+   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+
+   pytorch_pretrained_bert transfo_xl \
+     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [TRANSFO_XL_CONFIG]
+
+GPT-2
+^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model.
+
+.. code-block:: shell
+
+   export GPT2_DIR=/path/to/gpt2/checkpoint
+
+   pytorch_pretrained_bert gpt2 \
+     $GPT2_DIR/model.ckpt \
+     $PYTORCH_DUMP_OUTPUT \
+     [GPT2_CONFIG]
+
+XLNet
+^^^^^
+
+Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
+
+.. code-block:: shell
+
+   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+
+   pytorch_pretrained_bert xlnet \
+     $TRANSFO_XL_CHECKPOINT_PATH \
+     $TRANSFO_XL_CONFIG_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     STS-B \
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000000..7675393807
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../..'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = u'pytorch-transformers'
+copyright = u'2019, huggingface'
+author = u'huggingface'
+
+# The short X.Y version
+version = u''
+# The full version, including alpha/beta/rc tags
+release = u'1.0.0'
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.coverage',
+    'sphinx.ext.napoleon'
+
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = ['.rst', '.md']
+# source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself.  Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pytorch-transformersdoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+     u'huggingface', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+     author, 'pytorch-transformers', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+
+# -- Extension configuration -------------------------------------------------
diff --git a/docs/source/doc.rst b/docs/source/doc.rst
new file mode 100644
index 0000000000..662799053c
--- /dev/null
+++ b/docs/source/doc.rst
@@ -0,0 +1,521 @@
+Docs
+================================================
+
+
+
+Here is a detailed documentation of the classes in the package and how to use them:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Sub-section
+     - Description
+   * - `Loading pre-trained weights <#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump>`_
+     - How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance
+   * - `Serialization best-practices <#serialization-best-practices>`_
+     - How to save and reload a fine-tuned model
+   * - `Configurations <#configurations>`_
+     - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
+   * - `Models <#models>`_
+     - API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL
+   * - `Tokenizers <#tokenizers>`_
+     - API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL
+   * - `Optimizers <#optimizers>`_
+     - API of the optimizers
+
+
+Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``from_pretrained()`` method
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
+
+.. code-block:: python
+
+   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
+
+where
+
+
+* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
+*
+  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
+
+
+  *
+    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+
+
+    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
+    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`_
+    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
+    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
+    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
+
+  *
+    a path or url to a pretrained model archive containing:
+
+
+    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
+    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
+
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/modeling.py>`_\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+
+*
+  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
+
+* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
+* ``state_dict``\ : an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+
+``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`_ or the original TensorFlow repository.
+
+When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
+
+Examples:
+
+.. code-block:: python
+
+   # BERT
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+   # OpenAI GPT
+   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+   model = OpenAIGPTModel.from_pretrained('openai-gpt')
+
+   # Transformer-XL
+   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+
+   # OpenAI GPT-2
+   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+   model = GPT2Model.from_pretrained('gpt2')
+
+Cache directory
+~~~~~~~~~~~~~~~
+
+``pytorch_pretrained_bert`` save the pretrained weights in a cache directory which is located at (in this order of priority):
+
+
+* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
+* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
+* PyTorch cache home + ``/pytorch_pretrained_bert/``
+  where PyTorch cache home is defined by (in this order):
+
+  * shell environment variable ``ENV_TORCH_HOME``
+  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``\ )
+  * default: ``~/.cache/torch/``
+
+Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
+
+You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+
+Serialization best-practices
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
+There are three types of files you need to save to be able to reload a fine-tuned model:
+
+
+* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`_\ ,
+* the configuration file of the model which is saved as a JSON file, and
+* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+The *default filenames* of these files are as follow:
+
+
+* the model weights file: ``pytorch_model.bin``\ ,
+* the configuration file: ``config.json``\ ,
+* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
+* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
+
+**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
+
+Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
+
+.. code-block:: python
+
+   from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+
+   output_dir = "./models/"
+
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
+
+   # If we save using the predefined names, we can load using `from_pretrained`
+   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+   output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_dir)
+
+   # Step 2: Re-load the saved model and vocabulary
+
+   # Example for a Bert model
+   model = BertForQuestionAnswering.from_pretrained(output_dir)
+   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+   # Example for a GPT model
+   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
+
+Here is another way you can save and reload the model if you want to use specific paths for each type of files:
+
+.. code-block:: python
+
+   output_model_file = "./models/my_own_model_file.bin"
+   output_config_file = "./models/my_own_config_file.bin"
+   output_vocab_file = "./models/my_own_vocab_file.bin"
+
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
+
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_vocab_file)
+
+   # Step 2: Re-load the saved model and vocabulary
+
+   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
+   # Here is how to do it in this situation:
+
+   # Example for a Bert model
+   config = BertConfig.from_json_file(output_config_file)
+   model = BertForQuestionAnswering(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+   # Example for a GPT model
+   config = OpenAIGPTConfig.from_json_file(output_config_file)
+   model = OpenAIGPTDoubleHeadsModel(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
+
+Configurations
+^^^^^^^^^^^^^^
+
+Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
+
+
+* ``BertConfig`` for ``BertModel`` and BERT classes instances.
+* ``OpenAIGPTConfig`` for ``OpenAIGPTModel`` and OpenAI GPT classes instances.
+* ``GPT2Config`` for ``GPT2Model`` and OpenAI GPT-2 classes instances.
+* ``TransfoXLConfig`` for ``TransfoXLModel`` and Transformer-XL classes instances.
+
+These configuration classes contains a few utilities to load and save configurations:
+
+
+* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
+* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
+* ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
+* ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
+* ``to_json_file(json_file_path)``\ : Save an instance to a json file.
+
+Models
+^^^^^^
+
+1. ``BertModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertModel
+    :members:
+
+
+2. ``BertForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForPreTraining
+    :members:
+
+
+3. ``BertForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForMaskedLM
+    :members:
+
+
+4. ``BertForNextSentencePrediction``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForNextSentencePrediction
+    :members:
+
+
+5. ``BertForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForSequenceClassification
+    :members:
+
+
+6. ``BertForMultipleChoice``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForMultipleChoice
+    :members:
+
+
+7. ``BertForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForTokenClassification
+    :members:
+
+
+8. ``BertForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForQuestionAnswering
+    :members:
+
+
+9. ``OpenAIGPTModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTModel
+    :members:
+
+
+10. ``OpenAIGPTLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTLMHeadModel
+    :members:
+
+
+11. ``OpenAIGPTDoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTDoubleHeadsModel
+    :members:
+
+
+12. ``TransfoXLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.TransfoXLModel
+    :members:
+
+
+13. ``TransfoXLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.TransfoXLLMHeadModel
+    :members:
+
+
+14. ``GPT2Model``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2Model
+    :members:
+
+
+15. ``GPT2LMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2LMHeadModel
+    :members:
+
+
+16. ``GPT2DoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2DoubleHeadsModel
+    :members:
+
+
+Tokenizers
+^^^^^^^^^^
+
+``BertTokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+``BertTokenizer`` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
+
+This class has five arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``do_lower_case``\ : convert text to lower-case while tokenizing. **Default = True**.
+* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
+* ``do_basic_tokenize``\ : Do basic tokenization before wordpice tokenization. Set to false if text is pre-tokenized. **Default = True**.
+* ``never_split``\ : a list of tokens that should not be splitted during tokenization. **Default = ``["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]``\ **
+
+and three methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: ``vocab_file_path``. The vocabulary can be reloaded with ``BertTokenizer.from_pretrained('vocab_file_path')`` or ``BertTokenizer.from_pretrained('directory_path')``.
+
+Please refer to the doc strings and code in `\ ``tokenization.py`` <./pytorch_pretrained_bert/tokenization.py>`_ for the details of the ``BasicTokenizer`` and ``WordpieceTokenizer`` classes. In general it is recommended to use ``BertTokenizer`` unless you know what you are doing.
+
+``OpenAIGPTTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``OpenAIGPTTokenizer`` perform Byte-Pair-Encoding (BPE) tokenization.
+
+This class has four arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``merges_file``\ : path to a file containing the BPE merges.
+* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
+* ``special_tokens``\ : a list of tokens to add to the vocabulary for fine-tuning. If SpaCy is not installed and BERT's ``BasicTokenizer`` is used as the pre-BPE tokenizer, these tokens are not split. **Default= None**
+
+and five methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing BPE tokenization.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
+* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing BPE encoding.
+* `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
+* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
+
+Please refer to the doc strings and code in `\ ``tokenization_openai.py`` <./pytorch_pretrained_bert/tokenization_openai.py>`_ for the details of the ``OpenAIGPTTokenizer``.
+
+``TransfoXLTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``TransfoXLTokenizer`` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper (\ `Efficient softmax approximation for GPUs <http://arxiv.org/abs/1609.04309>`_\ ) for more details.
+
+The API is similar to the API of ``BertTokenizer`` (see above).
+
+Please refer to the doc strings and code in `\ ``tokenization_transfo_xl.py`` <./pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ for the details of these additional methods in ``TransfoXLTokenizer``.
+
+``GPT2Tokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+``GPT2Tokenizer`` perform byte-level Byte-Pair-Encoding (BPE) tokenization.
+
+This class has three arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``merges_file``\ : path to a file containing the BPE merges.
+* ``errors``\ : How to handle unicode decoding errors. **Default = ``replace``\ **
+
+and two methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing byte-level BPE.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
+* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing byte-level BPE.
+* ``decode(tokens)``\ : convert back a list of ``int`` tokens in a ``str``.
+* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
+
+Please refer to `\ ``tokenization_gpt2.py`` <./pytorch_pretrained_bert/tokenization_gpt2.py>`_ for more details on the ``GPT2Tokenizer``.
+
+Optimizers
+^^^^^^^^^^
+
+``BertAdam``
+~~~~~~~~~~~~~~~~
+
+``BertAdam`` is a ``torch.optimizer`` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
+
+
+* BertAdam implements weight decay fix,
+* BertAdam doesn't compensate for bias as in the regular Adam optimizer.
+
+The optimizer accepts the following arguments:
+
+
+* ``lr`` : learning rate
+* ``warmup`` : portion of ``t_total`` for the warmup, ``-1``  means no warmup. Default : ``-1``
+* ``t_total`` : total number of training steps for the learning
+    rate schedule, ``-1``  means constant learning rate. Default : ``-1``
+* ``schedule`` : schedule to use for the warmup (see above).
+    Can be ``'warmup_linear'``\ , ``'warmup_constant'``\ , ``'warmup_cosine'``\ , ``'none'``\ , ``None`` or a ``_LRSchedule`` object (see below).
+    If ``None`` or ``'none'``\ , learning rate is always kept constant.
+    Default : ``'warmup_linear'``
+* ``b1`` : Adams b1. Default : ``0.9``
+* ``b2`` : Adams b2. Default : ``0.999``
+* ``e`` : Adams epsilon. Default : ``1e-6``
+* ``weight_decay:`` Weight decay. Default : ``0.01``
+* ``max_grad_norm`` : Maximum norm for the gradients (\ ``-1`` means no clipping). Default : ``1.0``
+
+``OpenAIAdam``
+~~~~~~~~~~~~~~~~~~
+
+``OpenAIAdam`` is similar to ``BertAdam``.
+The differences with ``BertAdam`` is that ``OpenAIAdam`` compensate for bias as in the regular Adam optimizer.
+
+``OpenAIAdam`` accepts the same arguments as ``BertAdam``.
+
+Learning Rate Schedules
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``.
+All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction.
+When an ``_LRSchedule`` object is passed into ``BertAdam`` or ``OpenAIAdam``\ ,
+the ``warmup`` and ``t_total`` arguments on the optimizer are ignored and the ones in the ``_LRSchedule`` object are used.
+An overview of the implemented schedules:
+
+
+* ``ConstantLR``\ : always returns learning rate 1.
+* ``WarmupConstantSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+    Keeps learning rate equal to 1. after warmup.
+
+  .. image:: docs/imgs/warmup_constant_schedule.png
+     :target: docs/imgs/warmup_constant_schedule.png
+     :alt:
+
+* ``WarmupLinearSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+    Linearly decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps.
+
+  .. image:: docs/imgs/warmup_linear_schedule.png
+     :target: docs/imgs/warmup_linear_schedule.png
+     :alt:
+
+* ``WarmupCosineSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+   Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve.
+   If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+
+  .. image:: docs/imgs/warmup_cosine_schedule.png
+     :target: docs/imgs/warmup_cosine_schedule.png
+     :alt:
+
+* ``WarmupCosineWithHardRestartsSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+    If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
+
+  .. image:: docs/imgs/warmup_cosine_hard_restarts_schedule.png
+     :target: docs/imgs/warmup_cosine_hard_restarts_schedule.png
+     :alt:
+
+* ``WarmupCosineWithWarmupRestartsSchedule``\ : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
+    Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
+    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
+    Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
+
+  .. image:: docs/imgs/warmup_cosine_warm_restarts_schedule.png
+     :target: docs/imgs/warmup_cosine_warm_restarts_schedule.png
+     :alt:
\ No newline at end of file
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
new file mode 100644
index 0000000000..92e7ee661a
--- /dev/null
+++ b/docs/source/examples.rst
@@ -0,0 +1,593 @@
+Examples
+================================================
+
+.. list-table::
+   :header-rows: 1
+
+   * - Sub-section
+     - Description
+   * - `Training large models: introduction, tools and examples <#Training-large-models-introduction,-tools-and-examples>`_
+     - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
+   * - `Fine-tuning with BERT: running the examples <#Fine-tuning-with-BERT-running-the-examples>`_
+     - Running the examples in `\ ``./examples`` <./examples/>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
+   * - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#openai-gpt-transformer-xl-and-gpt-2-running-the-examples>`_
+     - Running the examples in `\ ``./examples`` <./examples/>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
+   * - `Fine-tuning BERT-large on GPUs <#Fine-tuning-BERT-large-on-GPUs>`_
+     - How to fine tune ``BERT large``
+
+
+Training large models: introduction, tools and examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
+
+To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `\ ``run_bert_classifier.py`` <./examples/run_bert_classifier.py>`_ and `\ ``run_bert_squad.py`` <./examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this month.
+
+Here is how to use these techniques in our scripts:
+
+
+* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying a integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradient will be accumulated over ``gradient_accumulation_steps`` steps.
+* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
+* **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below).
+* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`_ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`_. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
+
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`_. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
+
+Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <(https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_\ ) for more details):
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+
+Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
+
+Fine-tuning with BERT: running the examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We showcase several fine-tuning examples based on (and extended from) `the original implementation <https://github.com/google-research/bert/>`_\ :
+
+
+* a *sequence-level classifier* on nine different GLUE tasks,
+* a *token-level classifier* on the question answering dataset SQuAD, and
+* a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
+* a *BERT language model* on another target corpus
+
+GLUE results on dev set
+~~~~~~~~~~~~~~~~~~~~~~~
+
+We get the following results on the dev set of GLUE benchmark with an uncased BERT base
+model. All experiments were run on a P100 GPU with a batch size of 32.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Task
+     - Metric
+     - Result
+   * - CoLA
+     - Matthew's corr.
+     - 57.29
+   * - SST-2
+     - accuracy
+     - 93.00
+   * - MRPC
+     - F1/accuracy
+     - 88.85/83.82
+   * - STS-B
+     - Pearson/Spearman corr.
+     - 89.70/89.37
+   * - QQP
+     - accuracy/F1
+     - 90.72/87.41
+   * - MNLI
+     - matched acc./mismatched acc.
+     - 83.95/84.39
+   * - QNLI
+     - accuracy
+     - 89.04
+   * - RTE
+     - accuracy
+     - 61.01
+   * - WNLI
+     - accuracy
+     - 53.52
+
+
+Some of these results are significantly different from the ones reported on the test set
+of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 <https://gluebenchmark.com/faq>`_ on the webite.
+
+Before running anyone of these GLUE tasks you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+   export TASK_NAME=MRPC
+
+   python run_bert_classifier.py \
+     --task_name $TASK_NAME \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/$TASK_NAME \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/$TASK_NAME/
+
+where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
+
+The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
+
+The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor.
+
+MRPC
+~~~~
+
+This example code fine-tunes BERT on the Microsoft Research Paraphrase
+Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
+
+Before running this example you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_bert_classifier.py \
+     --task_name MRPC \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/MRPC/ \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/mrpc_output/
+
+Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`_ gave evaluation results between 84% and 88%.
+
+**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
+First install apex as indicated `here <https://github.com/NVIDIA/apex>`_.
+Then run
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_bert_classifier.py \
+     --task_name MRPC \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/MRPC/ \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/mrpc_output/ \
+     --fp16
+
+**Distributed training**
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name MRPC --do_train   --do_eval   --do_lower_case   --data_dir $GLUE_DIR/MRPC/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0  --output_dir /tmp/mrpc_output/
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+     acc = 0.8823529411764706
+     acc_and_f1 = 0.901702786377709
+     eval_loss = 0.3418912578906332
+     f1 = 0.9210526315789473
+     global_step = 174
+     loss = 0.07231863956341798
+
+Here is an example on MNLI:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --do_lower_case   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+
+.. code-block:: bash
+
+   ***** Eval results *****
+     acc = 0.8679706601466992
+     eval_loss = 0.4911287787382479
+     global_step = 18408
+     loss = 0.04755385363816904
+
+   ***** Eval results *****
+     acc = 0.8747965825874695
+     eval_loss = 0.45516540421714036
+     global_step = 18408
+     loss = 0.04755385363816904
+
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
+
+SQuAD
+~~~~~
+
+This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
+
+The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory.
+
+
+* `train-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json>`_
+* `dev-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json>`_
+* `evaluate-v1.1.py <https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py>`_
+
+.. code-block:: shell
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python run_bert_squad.py \
+     --bert_model bert-base-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --train_batch_size 12 \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2.0 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/
+
+Training with the previous hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
+   {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
+
+**distributed training**
+
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=8 \
+    run_bert_squad.py \
+    --bert_model bert-large-uncased-whole-word-masking  \
+    --do_train \
+    --do_predict \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --train_batch_size 24 \
+    --gradient_accumulation_steps 12
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
+   {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
+
+This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``.
+
+And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ :
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
+   {"exact_match": 84.18164616840113, "f1": 91.58645594850135}
+
+SWAG
+~~~~
+
+The data for SWAG can be downloaded by cloning the following `repository <https://github.com/rowanz/swagaf>`_
+
+.. code-block:: shell
+
+   export SWAG_DIR=/path/to/SWAG
+
+   python run_bert_swag.py \
+     --bert_model bert-base-uncased \
+     --do_train \
+     --do_lower_case \
+     --do_eval \
+     --data_dir $SWAG_DIR/data \
+     --train_batch_size 16 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --max_seq_length 80 \
+     --output_dir /tmp/swag_output/ \
+     --gradient_accumulation_steps 4
+
+Training with the previous hyper-parameters on a single GPU gave us the following results:
+
+.. code-block::
+
+   eval_accuracy = 0.8062081375587323
+   eval_loss = 0.5966546792367169
+   global_step = 13788
+   loss = 0.06423990014260186
+
+LM Fine-tuning
+~~~~~~~~~~~~~~
+
+The data should be a text file in the same format as `sample_text.txt <./samples/sample_text.txt>`_  (one sentence per line, docs separated by empty line).
+You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and splitted into ~500k sentences with spaCy.
+Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
+
+Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `\ ``README`` <./examples/lm_finetuning/README.md>`_ of the `\ ``examples/lm_finetuning/`` <./examples/lm_finetuning/>`_ folder.
+
+OpenAI GPT, Transformer-XL and GPT-2: running the examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We provide three examples of scripts for OpenAI GPT, Transformer-XL and OpenAI GPT-2 based on (and extended from) the respective original implementations:
+
+
+* fine-tuning OpenAI GPT on the ROCStories dataset
+* evaluating Transformer-XL on Wikitext 103
+* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
+
+Fine-tuning OpenAI GPT on the RocStories dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code fine-tunes OpenAI GPT on the RocStories dataset.
+
+Before running this example you should download the
+`RocStories dataset <https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories>`_ and unpack it to some directory ``$ROC_STORIES_DIR``.
+
+.. code-block:: shell
+
+   export ROC_STORIES_DIR=/path/to/RocStories
+
+   python run_openai_gpt.py \
+     --model_name openai-gpt \
+     --do_train \
+     --do_eval \
+     --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
+     --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
+     --output_dir ../log \
+     --train_batch_size 16 \
+
+This command runs in about 10 min on a single K-80 an gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
+
+Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code evaluate the pre-trained Transformer-XL on the WikiText 103 dataset.
+This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
+
+.. code-block:: shell
+
+   python run_transfo_xl.py --work_dir ../log
+
+This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
+
+Unconditional and conditional generation from OpenAI's GPT-2 model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code is identical to the original unconditional and conditional generation codes.
+
+Conditional generation:
+
+.. code-block:: shell
+
+   python run_gpt2.py
+
+Unconditional generation:
+
+.. code-block:: shell
+
+   python run_gpt2.py --unconditional
+
+The same option as in the original scripts are provided, please refere to the code of the example and the original repository of OpenAI.
+
+Fine-tuning BERT-large on GPUs
+------------------------------
+
+The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
+
+For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
+
+.. code-block:: bash
+
+   {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
+
+To get these results we used a combination of:
+
+
+* multi-GPU training (automatically activated on a multi-GPU server),
+* 2 steps of gradient accumulation and
+* perform the optimization step on CPU to store Adam's averages in RAM.
+
+Here is the full list of hyper-parameters for this run:
+
+.. code-block:: bash
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python ./run_bert_squad.py \
+     --bert_model bert-large-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --gradient_accumulation_steps 2
+
+If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
+
+Here is an example of hyper-parameters for a FP16 run we tried:
+
+.. code-block:: bash
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python ./run_bert_squad.py \
+     --bert_model bert-large-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --fp16 \
+     --loss_scale 128
+
+The results were similar to the above FP32 results (actually slightly higher):
+
+.. code-block:: bash
+
+   {"exact_match": 84.65468306527909, "f1": 91.238669287002}
+
+Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ :
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=8 \
+     run_bert_squad.py \
+     --bert_model bert-large-uncased-whole-word-masking \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --gradient_accumulation_steps 2
+
+Fine-tuning XLNet
+-----------------
+
+STS-B
+~~~~~
+
+This example code fine-tunes XLNet on the STS-B corpus.
+
+Before running this example you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_xlnet_classifier.py \
+    --task_name STS-B \
+    --do_train \
+    --do_eval \
+    --data_dir $GLUE_DIR/STS-B/ \
+    --max_seq_length 128 \
+    --train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+
+Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`_ gave evaluation results between 84% and 88%.
+
+**Distributed training**
+Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node 8 \
+    run_xlnet_classifier.py \
+    --task_name STS-B \
+    --do_train \
+    --do_eval \
+    --data_dir $GLUE_DIR/STS-B/ \
+    --max_seq_length 128 \
+    --train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+     acc = 0.8823529411764706
+     acc_and_f1 = 0.901702786377709
+     eval_loss = 0.3418912578906332
+     f1 = 0.9210526315789473
+     global_step = 174
+     loss = 0.07231863956341798
+
+Here is an example on MNLI:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+
+.. code-block:: bash
+
+   ***** Eval results *****
+     acc = 0.8679706601466992
+     eval_loss = 0.4911287787382479
+     global_step = 18408
+     loss = 0.04755385363816904
+
+   ***** Eval results *****
+     acc = 0.8747965825874695
+     eval_loss = 0.45516540421714036
+     global_step = 18408
+     loss = 0.04755385363816904
+
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
+
+BERTology
+---------
+
+There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
+
+
+* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
+* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
+* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+
+
+* accessing all the hidden-states of BERT/GPT/GPT-2,
+* accessing all the attention weights for each head of BERT/GPT/GPT-2,
+* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+
+To help you understand and use these features, we have added a specific example script: `\ ``bertology.py`` <./examples/bertology.py>`_ while extract information and prune a model pre-trained on MRPC.
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000000..85125f3cf3
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,207 @@
+Pytorch-Transformers: The Big & Extending Repository of pretrained Transformers
+================================================================================================================================================
+
+
+.. toctree::
+    :maxdepth: 2
+
+    installation
+    usage
+    doc
+    examples
+    notebooks
+    tpu
+    cli
+
+
+.. image:: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg
+   :target: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT
+   :alt: CircleCI
+
+
+This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
+
+
+* `Google's BERT model <https://github.com/google-research/bert>`_\ ,
+* `OpenAI's GPT model <https://github.com/openai/finetune-transformer-lm>`_\ ,
+* `Google/CMU's Transformer-XL model <https://github.com/kimiyoung/transformer-xl>`_\ , and
+* `OpenAI's GPT-2 model <https://blog.openai.com/better-language-models/>`_.
+
+These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <#examples>`_ section below.
+
+Here are some information on these models:
+
+**BERT** was released together with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+This PyTorch implementation of BERT is provided with `Google's pre-trained models <https://github.com/google-research/bert>`_\ , examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+
+**OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`_ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+
+**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`_ by Zihang Dai\ *, Zhilin Yang*\ , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`_ which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+
+**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`_ by Alec Radford\ *, Jeffrey Wu*\ , Rewon Child, David Luan, Dario Amodei\ ** and Ilya Sutskever**.
+This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`_ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+
+Content
+-------
+
+.. list-table::
+   :header-rows: 1
+
+   * - Section
+     - Description
+   * - `Installation <#installation>`_
+     - How to install the package
+   * - `Overview <#overview>`_
+     - Overview of the package
+   * - `Usage <#usage>`_
+     - Quickstart examples
+   * - `Doc <#doc>`_
+     - Detailed documentation
+   * - `Examples <#examples>`_
+     - Detailed examples on how to fine-tune Bert
+   * - `Notebooks <#notebooks>`_
+     - Introduction on the provided Jupyter Notebooks
+   * - `TPU <#tpu>`_
+     - Notes on TPU support and pretraining scripts
+   * - `Command-line interface <#Command-line-interface>`_
+     - Convert a TensorFlow checkpoint in a PyTorch dump
+
+Overview
+--------
+
+This package comprises the following classes that can be imported in Python and are detailed in the `Doc <#doc>`_ section of this readme:
+
+
+*
+  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py>`_ file):
+
+
+  * `BertModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L639>`_ - raw BERT Transformer model (\ **fully pre-trained**\ ),
+  * `BertForMaskedLM <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L793>`_ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
+  * `BertForNextSentencePrediction <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L854>`_ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
+  * `BertForPreTraining <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L722>`_ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
+  * `BertForSequenceClassification <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L916>`_ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `BertForMultipleChoice <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L982>`_ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `BertForTokenClassification <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1051>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
+  * `BertForQuestionAnswering <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1124>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
+
+*
+  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py>`_ file):
+
+
+  * `OpenAIGPTModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L536>`_ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
+  * `OpenAIGPTLMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L643>`_ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `OpenAIGPTDoubleHeadsModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L722>`_ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+
+*
+  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py>`_ file):
+
+
+  * `TransfoXLModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py#L983>`_ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLLMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py#L1260>`_ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
+
+*
+  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py>`_ file):
+
+
+  * `GPT2Model <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L479>`_ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
+  * `GPT2LMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L559>`_ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `GPT2DoubleHeadsModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L624>`_ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+
+*
+  Tokenizers for **BERT** (using word-piece) (in the `tokenization.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py>`_ file):
+
+
+  * ``BasicTokenizer`` - basic tokenization (punctuation splitting, lower casing, etc.),
+  * ``WordpieceTokenizer`` - WordPiece tokenization,
+  * ``BertTokenizer`` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
+
+*
+  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_openai.py>`_ file):
+
+
+  * ``OpenAIGPTTokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
+
+*
+  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ file):
+
+
+  * ``OpenAIGPTTokenizer`` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
+
+*
+  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_gpt2.py>`_ file):
+
+
+  * ``GPT2Tokenizer`` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
+
+*
+  Optimizer for **BERT** (in the `optimization.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/pytorch_pretrained_bert/optimization.py>`_ file):
+
+
+  * ``BertAdam`` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+
+*
+  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/optimization_openai.py>`_ file):
+
+
+  * ``OpenAIAdam`` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+
+*
+  Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective `modeling.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py>`_\ , `modeling_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py>`_\ , `modeling_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py>`_ files):
+
+
+  * ``BertConfig`` - Configuration class to store the configuration of a ``BertModel`` with utilities to read and write from JSON configuration files.
+  * ``OpenAIGPTConfig`` - Configuration class to store the configuration of a ``OpenAIGPTModel`` with utilities to read and write from JSON configuration files.
+  * ``GPT2Config`` - Configuration class to store the configuration of a ``GPT2Model`` with utilities to read and write from JSON configuration files.
+  * ``TransfoXLConfig`` - Configuration class to store the configuration of a ``TransfoXLModel`` with utilities to read and write from JSON configuration files.
+
+The repository further comprises:
+
+
+*
+  Five examples on how to use **BERT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+
+
+  * `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_extract_features.py>`_ - Show how to extract hidden states from an instance of ``BertModel``\ ,
+  * `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_classifier.py>`_ - Show how to fine-tune an instance of ``BertForSequenceClassification`` on GLUE's MRPC task,
+  * `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_squad.py>`_ - Show how to fine-tune an instance of ``BertForQuestionAnswering`` on SQuAD v1.0 and SQuAD v2.0 tasks.
+  * `run_swag.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_swag.py>`_ - Show how to fine-tune an instance of ``BertForMultipleChoice`` on Swag task.
+  * `simple_lm_finetuning.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/lm_finetuning/simple_lm_finetuning.py>`_ - Show how to fine-tune an instance of ``BertForPretraining`` on a target text corpus.
+
+*
+  One example on how to use **OpenAI GPT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+
+
+  * `run_openai_gpt.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_openai_gpt.py>`_ - Show how to fine-tune an instance of ``OpenGPTDoubleHeadsModel`` on the RocStories task.
+
+*
+  One example on how to use **Transformer-XL** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+
+
+  * `run_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_transfo_xl.py>`_ - Show how to load and evaluate a pre-trained model of ``TransfoXLLMHeadModel`` on WikiText 103.
+
+*
+  One example on how to use **OpenAI GPT-2** in the unconditional and interactive mode (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+
+
+  * `run_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py>`_ - Show how to use OpenAI GPT-2 an instance of ``GPT2LMHeadModel`` to generate text (same as the original OpenAI GPT-2 examples).
+
+  These examples are detailed in the `Examples <#examples>`_ section of this readme.
+
+*
+  Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the `notebooks folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks>`_\ ):
+
+
+  * `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_ - Compare the hidden states predicted by ``BertModel``\ ,
+  * `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_ - Compare the spans predicted by  ``BertForQuestionAnswering`` instances,
+  * `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_ - Compare the predictions of the ``BertForPretraining`` instances.
+
+  These notebooks are detailed in the `Notebooks <#notebooks>`_ section of this readme.
+
+
+*
+  A command-line interface to convert TensorFlow checkpoints (BERT, Transformer-XL) or NumPy checkpoint (OpenAI) in a PyTorch save of the associated PyTorch model:
+
+  This CLI is detailed in the `Command-line interface <#Command-line-interface>`_ section of this readme.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 0000000000..054d7a1323
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,48 @@
+Installation
+================================================
+
+This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
+
+With pip
+^^^^^^^^
+
+PyTorch pretrained bert can be installed by pip as follows:
+
+.. code-block:: bash
+
+   pip install pytorch-pretrained-bert
+
+If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+
+.. code-block:: bash
+
+   pip install spacy ftfy==4.4.3
+   python -m spacy download en
+
+If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+
+From source
+^^^^^^^^^^^
+
+Clone the repository and run:
+
+.. code-block:: bash
+
+   pip install [--editable] .
+
+Here also, if you want to reproduce the original tokenization process of the ``OpenAI GPT`` model, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+
+.. code-block:: bash
+
+   pip install spacy ftfy==4.4.3
+   python -m spacy download en
+
+Again, if you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage).
+
+A series of tests is included in the `tests folder <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests>`_ and can be run using ``pytest`` (install pytest if needed: ``pip install pytest``\ ).
+
+You can run the tests with the command:
+
+.. code-block:: bash
+
+   python -m pytest -sv tests/
diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst
new file mode 100644
index 0000000000..f608bf64c5
--- /dev/null
+++ b/docs/source/notebooks.rst
@@ -0,0 +1,16 @@
+Notebooks
+================================================
+
+We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+
+
+*
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <./notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+
+*
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+
+*
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+
+Please follow the instructions given in the notebooks to run and modify them.
diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst
new file mode 100644
index 0000000000..31f72ca891
--- /dev/null
+++ b/docs/source/tpu.rst
@@ -0,0 +1,13 @@
+TPU
+================================================
+
+TPU support and pretraining scripts
+------------------------------------------------
+
+TPU are not supported by the current stable release of PyTorch (0.4.1). However, the next version of PyTorch (v1.0) should support training on TPU and is expected to be released soon (see the recent `official announcement <https://cloud.google.com/blog/products/ai-machine-learning/introducing-pytorch-across-google-cloud>`_\ ).
+
+We will add TPU support when this next release is published.
+
+The original TensorFlow code further comprises two scripts for pre-training BERT: `create_pretraining_data.py <https://github.com/google-research/bert/blob/master/create_pretraining_data.py>`_ and `run_pretraining.py <https://github.com/google-research/bert/blob/master/run_pretraining.py>`_.
+
+Since, pre-training BERT is a particularly expensive operation that basically requires one or several TPUs to be completed in a reasonable amout of time (see details `here <https://github.com/google-research/bert#pre-training-with-bert>`_\ ) we have decided to wait for the inclusion of TPU support in PyTorch to convert these pre-training scripts.
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000000..1e48a6ecf8
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,339 @@
+Usage
+================================================
+
+BERT
+^^^^
+
+Here is a quick-start example using ``BertTokenizer``\ , ``BertModel`` and ``BertForMaskedLM`` class with Google AI's pre-trained ``Bert base uncased`` model. See the `doc section <#doc>`_ below for all the details on these classes.
+
+First let's prepare a tokenized input with ``BertTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+   # Tokenized input
+   text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+   tokenized_text = tokenizer.tokenize(text)
+
+   # Mask a token that we will try to predict back with `BertForMaskedLM`
+   masked_index = 8
+   tokenized_text[masked_index] = '[MASK]'
+   assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
+
+   # Convert token to vocabulary indices
+   indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+   # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
+   segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor = torch.tensor([indexed_tokens])
+   segments_tensors = torch.tensor([segments_ids])
+
+Let's see how to use ``BertModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = BertModel.from_pretrained('bert-base-uncased')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   segments_tensors = segments_tensors.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       encoded_layers, _ = model(tokens_tensor, segments_tensors)
+   # We have a hidden states for each of the 12 layers in model bert-base-uncased
+   assert len(encoded_layers) == 12
+
+And how to use ``BertForMaskedLM``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   segments_tensors = segments_tensors.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions = model(tokens_tensor, segments_tensors)
+
+   # confirm we were able to predict 'henson'
+   predicted_index = torch.argmax(predictions[0, masked_index]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == 'henson'
+
+OpenAI GPT
+^^^^^^^^^^
+
+Here is a quick-start example using ``OpenAIGPTTokenizer``\ , ``OpenAIGPTModel`` and ``OpenAIGPTLMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <#doc>`_ below for all the details on these classes.
+
+First let's prepare a tokenized input with ``OpenAIGPTTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+
+   # Tokenized input
+   text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   tokenized_text = tokenizer.tokenize(text)
+
+   # Convert token to vocabulary indices
+   indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor = torch.tensor([indexed_tokens])
+
+Let's see how to use ``OpenAIGPTModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       hidden_states = model(tokens_tensor)
+
+And how to use ``OpenAIGPTLMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions = model(tokens_tensor)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions[0, -1, :]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == '.</w>'
+
+And how to use ``OpenAIGPTDoubleHeadsModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   #  Prepare tokenized input
+   text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+   tokenized_text1 = tokenizer.tokenize(text1)
+   tokenized_text2 = tokenizer.tokenize(text2)
+   indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+   indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+   tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+   mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
+
+Transformer-XL
+^^^^^^^^^^^^^^
+
+Here is a quick-start example using ``TransfoXLTokenizer``\ , ``TransfoXLModel`` and ``TransfoXLModelLMHeadModel`` class with the Transformer-XL model pre-trained on WikiText-103. See the `doc section <#doc>`_ below for all the details on these classes.
+
+First let's prepare a tokenized input with ``TransfoXLTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary from wikitext 103)
+   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+
+   # Tokenized input
+   text_1 = "Who was Jim Henson ?"
+   text_2 = "Jim Henson was a puppeteer"
+   tokenized_text_1 = tokenizer.tokenize(text_1)
+   tokenized_text_2 = tokenizer.tokenize(text_2)
+
+   # Convert token to vocabulary indices
+   indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
+   indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+   tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+Let's see how to use ``TransfoXLModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   with torch.no_grad():
+       # Predict hidden states features for each layer
+       hidden_states_1, mems_1 = model(tokens_tensor_1)
+       # We can re-use the memory cells in a subsequent call to attend a longer context
+       hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
+
+And how to use ``TransfoXLLMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   with torch.no_grad():
+       # Predict all tokens
+       predictions_1, mems_1 = model(tokens_tensor_1)
+       # We can re-use the memory cells in a subsequent call to attend a longer context
+       predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == 'who'
+
+OpenAI GPT-2
+^^^^^^^^^^^^
+
+Here is a quick-start example using ``GPT2Tokenizer``\ , ``GPT2Model`` and ``GPT2LMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <#doc>`_ below for all the details on these classes.
+
+First let's prepare a tokenized input with ``GPT2Tokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+   # Encode some inputs
+   text_1 = "Who was Jim Henson ?"
+   text_2 = "Jim Henson was a puppeteer"
+   indexed_tokens_1 = tokenizer.encode(text_1)
+   indexed_tokens_2 = tokenizer.encode(text_2)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+   tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+Let's see how to use ``GPT2Model`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2Model.from_pretrained('gpt2')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       hidden_states_1, past = model(tokens_tensor_1)
+       # past can be used to reuse precomputed hidden state in a subsequent predictions
+       # (see beam-search examples in the run_gpt2.py example).
+       hidden_states_2, past = model(tokens_tensor_2, past=past)
+
+And how to use ``GPT2LMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2LMHeadModel.from_pretrained('gpt2')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions_1, past = model(tokens_tensor_1)
+       # past can be used to reuse precomputed hidden state in a subsequent predictions
+       # (see beam-search examples in the run_gpt2.py example).
+       predictions_2, past = model(tokens_tensor_2, past=past)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+   predicted_token = tokenizer.decode([predicted_index])
+
+And how to use ``GPT2DoubleHeadsModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+   model.eval()
+
+   #  Prepare tokenized input
+   text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+   tokenized_text1 = tokenizer.tokenize(text1)
+   tokenized_text2 = tokenizer.tokenize(text2)
+   indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+   indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+   tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+   mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_pretrained_bert/modeling_bert.py
index d4967b3718..30ea4ece9e 100644
--- a/pytorch_pretrained_bert/modeling_bert.py
+++ b/pytorch_pretrained_bert/modeling_bert.py
@@ -565,53 +565,25 @@ class BertPreTrainedModel(PreTrainedModel):
 
 
 class BertModel(BertPreTrainedModel):
-    """BERT model ("Bidirectional Embedding Representations from a Transformer").
+    r"""BERT model ("Bidirectional Embedding Representations from a Transformer").
 
-    Params:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+    :class:`~pytorch_pretrained_bert.BertModel` is the basic BERT Transformer model with a layer of summed token, \
+    position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 \
+    for BERT-large). The model is instantiated with the following parameters.
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Arguments:
+        config: a BertConfig class instance with the configuration to build a new model
+        output_attentions: If True, also output attentions weights computed by the model at each layer. Default: False
+        output_hidden_states: If True, also output hidden states computed by the model at each layer. Default: Fals
 
 
-    Outputs: Tuple of (encoded_layers, pooled_output)
-        `encoded_layers`: controled by `output_all_encoded_layers` argument:
-            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
-                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                to the last attention block of shape [batch_size, sequence_length, hidden_size],
-        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
-            classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        model = modeling.BertModel(config=config)
 
-    model = modeling.BertModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
     """
     def __init__(self, config):
         super(BertModel, self).__init__(config)
@@ -631,6 +603,58 @@ class BertModel(BertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+
+        Arguments:
+            input_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the \
+                vocabulary(see the tokens pre-processing logic in the scripts `run_bert_extract_features.py`, \
+                `run_bert_classifier.py` and `run_bert_squad.py`)
+            token_type_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
+                a `sentence B` token (see BERT paper for more details).
+            attention_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices \
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
+                input sequence length in the current batch. It's the mask that we typically use for attention when \
+                a batch has varying length sentences.
+            output_all_encoded_layers: boolean which controls the content of the `encoded_layers` output as described \
+            below. Default: `True`.
+            head_mask: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
+            and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 \
+            => head is not masked.
+
+
+        Returns:
+            A tuple composed of (encoded_layers, pooled_output). Encoded layers are controlled by the \
+            ``output_all_encoded_layers`` argument.
+
+            If ``output_all_encoded_layers`` is set to True, outputs a list of the full sequences of \
+            encoded-hidden-states at the end of each attention \
+            block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a\
+            torch.FloatTensor of size [batch_size, sequence_length, hidden_size].
+
+            If set to False, outputs only the full sequence of hidden-states corresponding \
+            to the last attention block of shape [batch_size, sequence_length, hidden_size].
+
+            ``pooled_output`` is a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a \
+            classifier pretrained on top of the hidden state associated to the first character of the \
+            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
+
+
+        """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:

From df759114c939a2c276085df168141a8a5fa3acaa Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 5 Jul 2019 17:35:26 -0400
Subject: [PATCH 066/139] Single file documentation for each model, accompanied
 by the Documentation overview.

---
 docs/index.rst                                |   2 -
 docs/source/index.rst                         |  15 +-
 docs/source/model_doc/bert.rst                | 110 +++++++
 docs/source/model_doc/gpt.rst                 |  59 ++++
 docs/source/model_doc/gpt2.rst                |  49 +++
 .../{doc.rst => model_doc/overview.rst}       | 291 ++----------------
 docs/source/model_doc/transformerxl.rst       |  26 ++
 docs/source/model_doc/xlm.rst                 |   2 +
 docs/source/model_doc/xlnet.rst               |   2 +
 9 files changed, 290 insertions(+), 266 deletions(-)
 delete mode 100644 docs/index.rst
 create mode 100644 docs/source/model_doc/bert.rst
 create mode 100644 docs/source/model_doc/gpt.rst
 create mode 100644 docs/source/model_doc/gpt2.rst
 rename docs/source/{doc.rst => model_doc/overview.rst} (60%)
 create mode 100644 docs/source/model_doc/transformerxl.rst
 create mode 100644 docs/source/model_doc/xlm.rst
 create mode 100644 docs/source/model_doc/xlnet.rst

diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index 4639f1d218..0000000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Home
-====
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 85125f3cf3..d7b60bd660 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -4,16 +4,29 @@ Pytorch-Transformers: The Big & Extending Repository of pretrained Transformers
 
 .. toctree::
     :maxdepth: 2
+    :caption: Notes
 
     installation
     usage
-    doc
     examples
     notebooks
     tpu
     cli
 
 
+.. toctree::
+    :maxdepth: 2
+    :caption: Package Reference
+
+    model_doc/overview
+    model_doc/bert
+    model_doc/gpt
+    model_doc/transformerxl
+    model_doc/gpt2
+    model_doc/xlm
+    model_doc/xlnet
+
+
 .. image:: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg
    :target: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT
    :alt: CircleCI
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
new file mode 100644
index 0000000000..018f3e3968
--- /dev/null
+++ b/docs/source/model_doc/bert.rst
@@ -0,0 +1,110 @@
+BERT
+----------------------------------------------------
+
+``BertTokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+``BertTokenizer`` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
+
+This class has five arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``do_lower_case``\ : convert text to lower-case while tokenizing. **Default = True**.
+* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
+* ``do_basic_tokenize``\ : Do basic tokenization before wordpice tokenization. Set to false if text is pre-tokenized. **Default = True**.
+* ``never_split``\ : a list of tokens that should not be splitted during tokenization. **Default = ``["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]``\ **
+
+and three methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: ``vocab_file_path``. The vocabulary can be reloaded with ``BertTokenizer.from_pretrained('vocab_file_path')`` or ``BertTokenizer.from_pretrained('directory_path')``.
+
+Please refer to the doc strings and code in `\ ``tokenization.py`` <./pytorch_pretrained_bert/tokenization.py>`_ for the details of the ``BasicTokenizer`` and ``WordpieceTokenizer`` classes. In general it is recommended to use ``BertTokenizer`` unless you know what you are doing.
+
+
+``BertAdam``
+~~~~~~~~~~~~~~~~
+
+``BertAdam`` is a ``torch.optimizer`` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
+
+
+* BertAdam implements weight decay fix,
+* BertAdam doesn't compensate for bias as in the regular Adam optimizer.
+
+The optimizer accepts the following arguments:
+
+
+* ``lr`` : learning rate
+* ``warmup`` : portion of ``t_total`` for the warmup, ``-1``  means no warmup. Default : ``-1``
+* ``t_total`` : total number of training steps for the learning
+    rate schedule, ``-1``  means constant learning rate. Default : ``-1``
+* ``schedule`` : schedule to use for the warmup (see above).
+    Can be ``'warmup_linear'``\ , ``'warmup_constant'``\ , ``'warmup_cosine'``\ , ``'none'``\ , ``None`` or a ``_LRSchedule`` object (see below).
+    If ``None`` or ``'none'``\ , learning rate is always kept constant.
+    Default : ``'warmup_linear'``
+* ``b1`` : Adams b1. Default : ``0.9``
+* ``b2`` : Adams b2. Default : ``0.999``
+* ``e`` : Adams epsilon. Default : ``1e-6``
+* ``weight_decay:`` Weight decay. Default : ``0.01``
+* ``max_grad_norm`` : Maximum norm for the gradients (\ ``-1`` means no clipping). Default : ``1.0``
+
+
+1. ``BertModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertModel
+    :members:
+
+
+2. ``BertForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForPreTraining
+    :members:
+
+
+3. ``BertForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForMaskedLM
+    :members:
+
+
+4. ``BertForNextSentencePrediction``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForNextSentencePrediction
+    :members:
+
+
+5. ``BertForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForSequenceClassification
+    :members:
+
+
+6. ``BertForMultipleChoice``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForMultipleChoice
+    :members:
+
+
+7. ``BertForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForTokenClassification
+    :members:
+
+
+8. ``BertForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertForQuestionAnswering
+    :members:
+
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
new file mode 100644
index 0000000000..59e84a342a
--- /dev/null
+++ b/docs/source/model_doc/gpt.rst
@@ -0,0 +1,59 @@
+OpenAI GPT
+----------------------------------------------------
+
+
+``OpenAIGPTTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``OpenAIGPTTokenizer`` perform Byte-Pair-Encoding (BPE) tokenization.
+
+This class has four arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``merges_file``\ : path to a file containing the BPE merges.
+* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
+* ``special_tokens``\ : a list of tokens to add to the vocabulary for fine-tuning. If SpaCy is not installed and BERT's ``BasicTokenizer`` is used as the pre-BPE tokenizer, these tokens are not split. **Default= None**
+
+and five methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing BPE tokenization.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
+* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing BPE encoding.
+* `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
+* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
+
+Please refer to the doc strings and code in `\ ``tokenization_openai.py`` <./pytorch_pretrained_bert/tokenization_openai.py>`_ for the details of the ``OpenAIGPTTokenizer``.
+
+
+``OpenAIAdam``
+~~~~~~~~~~~~~~~~~~
+
+``OpenAIAdam`` is similar to ``BertAdam``.
+The differences with ``BertAdam`` is that ``OpenAIAdam`` compensate for bias as in the regular Adam optimizer.
+
+``OpenAIAdam`` accepts the same arguments as ``BertAdam``.
+
+
+9. ``OpenAIGPTModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTModel
+    :members:
+
+
+10. ``OpenAIGPTLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTLMHeadModel
+    :members:
+
+
+11. ``OpenAIGPTDoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTDoubleHeadsModel
+    :members:
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
new file mode 100644
index 0000000000..bfcf26acbb
--- /dev/null
+++ b/docs/source/model_doc/gpt2.rst
@@ -0,0 +1,49 @@
+OpenAI GPT2
+----------------------------------------------------
+
+
+``GPT2Tokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+``GPT2Tokenizer`` perform byte-level Byte-Pair-Encoding (BPE) tokenization.
+
+This class has three arguments:
+
+
+* ``vocab_file``\ : path to a vocabulary file.
+* ``merges_file``\ : path to a file containing the BPE merges.
+* ``errors``\ : How to handle unicode decoding errors. **Default = ``replace``\ **
+
+and two methods:
+
+
+* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing byte-level BPE.
+* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
+* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
+* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
+* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing byte-level BPE.
+* ``decode(tokens)``\ : convert back a list of ``int`` tokens in a ``str``.
+* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
+
+Please refer to `\ ``tokenization_gpt2.py`` <./pytorch_pretrained_bert/tokenization_gpt2.py>`_ for more details on the ``GPT2Tokenizer``.
+
+
+14. ``GPT2Model``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2Model
+    :members:
+
+
+15. ``GPT2LMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2LMHeadModel
+    :members:
+
+
+16. ``GPT2DoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2DoubleHeadsModel
+    :members:
diff --git a/docs/source/doc.rst b/docs/source/model_doc/overview.rst
similarity index 60%
rename from docs/source/doc.rst
rename to docs/source/model_doc/overview.rst
index 662799053c..8f5e94baf1 100644
--- a/docs/source/doc.rst
+++ b/docs/source/model_doc/overview.rst
@@ -1,8 +1,7 @@
-Docs
+Overview
 ================================================
 
 
-
 Here is a detailed documentation of the classes in the package and how to use them:
 
 .. list-table::
@@ -24,6 +23,31 @@ Here is a detailed documentation of the classes in the package and how to use th
      - API of the optimizers
 
 
+Configurations
+^^^^^^^^^^^^^^
+
+Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which contains the
+parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON
+configuration files. The respective configuration classes are:
+
+
+* ``BertConfig`` for ``BertModel`` and BERT classes instances.
+* ``OpenAIGPTConfig`` for ``OpenAIGPTModel`` and OpenAI GPT classes instances.
+* ``GPT2Config`` for ``GPT2Model`` and OpenAI GPT-2 classes instances.
+* ``TransfoXLConfig`` for ``TransfoXLModel`` and Transformer-XL classes instances.
+
+These configuration classes contains a few utilities to load and save configurations:
+
+
+* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters.
+ Returns an instance of the configuration class.
+* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters.
+Returns an instance of the configuration class.
+* ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
+* ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
+* ``to_json_file(json_file_path)``\ : Save an instance to a json file.
+
+
 Loading Google AI or OpenAI pre-trained weights or PyTorch dump
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -124,7 +148,7 @@ Usually, if you don't set any specific environment variable, ``pytorch_pretraine
 You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
 
 Serialization best-practices
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
 There are three types of files you need to save to be able to reload a fine-tuned model:
@@ -212,267 +236,8 @@ Here is another way you can save and reload the model if you want to use specifi
    model.load_state_dict(state_dict)
    tokenizer = OpenAIGPTTokenizer(output_vocab_file)
 
-Configurations
-^^^^^^^^^^^^^^
-
-Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
-
-
-* ``BertConfig`` for ``BertModel`` and BERT classes instances.
-* ``OpenAIGPTConfig`` for ``OpenAIGPTModel`` and OpenAI GPT classes instances.
-* ``GPT2Config`` for ``GPT2Model`` and OpenAI GPT-2 classes instances.
-* ``TransfoXLConfig`` for ``TransfoXLModel`` and Transformer-XL classes instances.
-
-These configuration classes contains a few utilities to load and save configurations:
-
-
-* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
-* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
-* ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
-* ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
-* ``to_json_file(json_file_path)``\ : Save an instance to a json file.
-
-Models
-^^^^^^
-
-1. ``BertModel``
-~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertModel
-    :members:
-
-
-2. ``BertForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForPreTraining
-    :members:
-
-
-3. ``BertForMaskedLM``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForMaskedLM
-    :members:
-
-
-4. ``BertForNextSentencePrediction``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForNextSentencePrediction
-    :members:
-
-
-5. ``BertForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForSequenceClassification
-    :members:
-
-
-6. ``BertForMultipleChoice``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForMultipleChoice
-    :members:
-
-
-7. ``BertForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForTokenClassification
-    :members:
-
-
-8. ``BertForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.BertForQuestionAnswering
-    :members:
-
-
-9. ``OpenAIGPTModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTModel
-    :members:
-
-
-10. ``OpenAIGPTLMHeadModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTLMHeadModel
-    :members:
-
-
-11. ``OpenAIGPTDoubleHeadsModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTDoubleHeadsModel
-    :members:
-
-
-12. ``TransfoXLModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.TransfoXLModel
-    :members:
-
-
-13. ``TransfoXLLMHeadModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.TransfoXLLMHeadModel
-    :members:
-
-
-14. ``GPT2Model``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.GPT2Model
-    :members:
-
-
-15. ``GPT2LMHeadModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.GPT2LMHeadModel
-    :members:
-
-
-16. ``GPT2DoubleHeadsModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_pretrained_bert.GPT2DoubleHeadsModel
-    :members:
-
-
-Tokenizers
-^^^^^^^^^^
-
-``BertTokenizer``
-~~~~~~~~~~~~~~~~~~~~~
-
-``BertTokenizer`` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
-
-This class has five arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``do_lower_case``\ : convert text to lower-case while tokenizing. **Default = True**.
-* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
-* ``do_basic_tokenize``\ : Do basic tokenization before wordpice tokenization. Set to false if text is pre-tokenized. **Default = True**.
-* ``never_split``\ : a list of tokens that should not be splitted during tokenization. **Default = ``["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]``\ **
-
-and three methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: ``vocab_file_path``. The vocabulary can be reloaded with ``BertTokenizer.from_pretrained('vocab_file_path')`` or ``BertTokenizer.from_pretrained('directory_path')``.
-
-Please refer to the doc strings and code in `\ ``tokenization.py`` <./pytorch_pretrained_bert/tokenization.py>`_ for the details of the ``BasicTokenizer`` and ``WordpieceTokenizer`` classes. In general it is recommended to use ``BertTokenizer`` unless you know what you are doing.
-
-``OpenAIGPTTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-``OpenAIGPTTokenizer`` perform Byte-Pair-Encoding (BPE) tokenization.
-
-This class has four arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``merges_file``\ : path to a file containing the BPE merges.
-* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
-* ``special_tokens``\ : a list of tokens to add to the vocabulary for fine-tuning. If SpaCy is not installed and BERT's ``BasicTokenizer`` is used as the pre-BPE tokenizer, these tokens are not split. **Default= None**
-
-and five methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing BPE tokenization.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
-* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing BPE encoding.
-* `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
-* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
-
-Please refer to the doc strings and code in `\ ``tokenization_openai.py`` <./pytorch_pretrained_bert/tokenization_openai.py>`_ for the details of the ``OpenAIGPTTokenizer``.
-
-``TransfoXLTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-``TransfoXLTokenizer`` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper (\ `Efficient softmax approximation for GPUs <http://arxiv.org/abs/1609.04309>`_\ ) for more details.
-
-The API is similar to the API of ``BertTokenizer`` (see above).
-
-Please refer to the doc strings and code in `\ ``tokenization_transfo_xl.py`` <./pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ for the details of these additional methods in ``TransfoXLTokenizer``.
-
-``GPT2Tokenizer``
-~~~~~~~~~~~~~~~~~~~~~
-
-``GPT2Tokenizer`` perform byte-level Byte-Pair-Encoding (BPE) tokenization.
-
-This class has three arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``merges_file``\ : path to a file containing the BPE merges.
-* ``errors``\ : How to handle unicode decoding errors. **Default = ``replace``\ **
-
-and two methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing byte-level BPE.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
-* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing byte-level BPE.
-* ``decode(tokens)``\ : convert back a list of ``int`` tokens in a ``str``.
-* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
-
-Please refer to `\ ``tokenization_gpt2.py`` <./pytorch_pretrained_bert/tokenization_gpt2.py>`_ for more details on the ``GPT2Tokenizer``.
-
-Optimizers
-^^^^^^^^^^
-
-``BertAdam``
-~~~~~~~~~~~~~~~~
-
-``BertAdam`` is a ``torch.optimizer`` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
-
-
-* BertAdam implements weight decay fix,
-* BertAdam doesn't compensate for bias as in the regular Adam optimizer.
-
-The optimizer accepts the following arguments:
-
-
-* ``lr`` : learning rate
-* ``warmup`` : portion of ``t_total`` for the warmup, ``-1``  means no warmup. Default : ``-1``
-* ``t_total`` : total number of training steps for the learning
-    rate schedule, ``-1``  means constant learning rate. Default : ``-1``
-* ``schedule`` : schedule to use for the warmup (see above).
-    Can be ``'warmup_linear'``\ , ``'warmup_constant'``\ , ``'warmup_cosine'``\ , ``'none'``\ , ``None`` or a ``_LRSchedule`` object (see below).
-    If ``None`` or ``'none'``\ , learning rate is always kept constant.
-    Default : ``'warmup_linear'``
-* ``b1`` : Adams b1. Default : ``0.9``
-* ``b2`` : Adams b2. Default : ``0.999``
-* ``e`` : Adams epsilon. Default : ``1e-6``
-* ``weight_decay:`` Weight decay. Default : ``0.01``
-* ``max_grad_norm`` : Maximum norm for the gradients (\ ``-1`` means no clipping). Default : ``1.0``
-
-``OpenAIAdam``
-~~~~~~~~~~~~~~~~~~
-
-``OpenAIAdam`` is similar to ``BertAdam``.
-The differences with ``BertAdam`` is that ``OpenAIAdam`` compensate for bias as in the regular Adam optimizer.
-
-``OpenAIAdam`` accepts the same arguments as ``BertAdam``.
-
 Learning Rate Schedules
-~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``.
 All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction.
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
new file mode 100644
index 0000000000..c84693b38d
--- /dev/null
+++ b/docs/source/model_doc/transformerxl.rst
@@ -0,0 +1,26 @@
+Transformer XL
+----------------------------------------------------
+
+
+``TransfoXLTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``TransfoXLTokenizer`` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper (\ `Efficient softmax approximation for GPUs <http://arxiv.org/abs/1609.04309>`_\ ) for more details.
+
+The API is similar to the API of ``BertTokenizer`` (see above).
+
+Please refer to the doc strings and code in `\ ``tokenization_transfo_xl.py`` <./pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ for the details of these additional methods in ``TransfoXLTokenizer``.
+
+
+12. ``TransfoXLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.TransfoXLModel
+    :members:
+
+
+13. ``TransfoXLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.TransfoXLLMHeadModel
+    :members:
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
new file mode 100644
index 0000000000..70b5fa3b40
--- /dev/null
+++ b/docs/source/model_doc/xlm.rst
@@ -0,0 +1,2 @@
+XLM
+----------------------------------------------------
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
new file mode 100644
index 0000000000..d2fd996cbe
--- /dev/null
+++ b/docs/source/model_doc/xlnet.rst
@@ -0,0 +1,2 @@
+XLNet
+----------------------------------------------------

From 64fd98637629f371e3692a08017eb79458755cae Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 5 Jul 2019 17:44:59 -0400
Subject: [PATCH 067/139] Tokenizers and Config classes are referenced.

---
 docs/source/model_doc/bert.rst          | 54 +++++--------------------
 docs/source/model_doc/gpt.rst           | 36 +++++------------
 docs/source/model_doc/gpt2.rst          | 29 ++++---------
 docs/source/model_doc/transformerxl.rst | 14 ++++---
 docs/source/model_doc/xlm.rst           |  3 ++
 docs/source/model_doc/xlnet.rst         |  2 +
 6 files changed, 43 insertions(+), 95 deletions(-)

diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 018f3e3968..7dc669af75 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -1,57 +1,25 @@
 BERT
 ----------------------------------------------------
 
+``BertConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.BertConfig
+    :members:
+
+
 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-``BertTokenizer`` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
-
-This class has five arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``do_lower_case``\ : convert text to lower-case while tokenizing. **Default = True**.
-* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
-* ``do_basic_tokenize``\ : Do basic tokenization before wordpice tokenization. Set to false if text is pre-tokenized. **Default = True**.
-* ``never_split``\ : a list of tokens that should not be splitted during tokenization. **Default = ``["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]``\ **
-
-and three methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: ``vocab_file_path``. The vocabulary can be reloaded with ``BertTokenizer.from_pretrained('vocab_file_path')`` or ``BertTokenizer.from_pretrained('directory_path')``.
-
-Please refer to the doc strings and code in `\ ``tokenization.py`` <./pytorch_pretrained_bert/tokenization.py>`_ for the details of the ``BasicTokenizer`` and ``WordpieceTokenizer`` classes. In general it is recommended to use ``BertTokenizer`` unless you know what you are doing.
+.. autoclass:: pytorch_pretrained_bert.BertTokenizer
+    :members:
 
 
 ``BertAdam``
 ~~~~~~~~~~~~~~~~
 
-``BertAdam`` is a ``torch.optimizer`` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
-
-
-* BertAdam implements weight decay fix,
-* BertAdam doesn't compensate for bias as in the regular Adam optimizer.
-
-The optimizer accepts the following arguments:
-
-
-* ``lr`` : learning rate
-* ``warmup`` : portion of ``t_total`` for the warmup, ``-1``  means no warmup. Default : ``-1``
-* ``t_total`` : total number of training steps for the learning
-    rate schedule, ``-1``  means constant learning rate. Default : ``-1``
-* ``schedule`` : schedule to use for the warmup (see above).
-    Can be ``'warmup_linear'``\ , ``'warmup_constant'``\ , ``'warmup_cosine'``\ , ``'none'``\ , ``None`` or a ``_LRSchedule`` object (see below).
-    If ``None`` or ``'none'``\ , learning rate is always kept constant.
-    Default : ``'warmup_linear'``
-* ``b1`` : Adams b1. Default : ``0.9``
-* ``b2`` : Adams b2. Default : ``0.999``
-* ``e`` : Adams epsilon. Default : ``1e-6``
-* ``weight_decay:`` Weight decay. Default : ``0.01``
-* ``max_grad_norm`` : Maximum norm for the gradients (\ ``-1`` means no clipping). Default : ``1.0``
-
+.. autoclass:: pytorch_pretrained_bert.BertAdam
+    :members:
 
 1. ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 59e84a342a..3db40719b3 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -1,41 +1,25 @@
 OpenAI GPT
 ----------------------------------------------------
 
+``OpenAIGPTConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTConfig
+    :members:
+
 
 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-``OpenAIGPTTokenizer`` perform Byte-Pair-Encoding (BPE) tokenization.
-
-This class has four arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``merges_file``\ : path to a file containing the BPE merges.
-* ``max_len``\ : max length to filter the input of the Transformer. Default to pre-trained value for the model if ``None``. **Default = None**
-* ``special_tokens``\ : a list of tokens to add to the vocabulary for fine-tuning. If SpaCy is not installed and BERT's ``BasicTokenizer`` is used as the pre-BPE tokenizer, these tokens are not split. **Default= None**
-
-and five methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing BPE tokenization.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
-* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing BPE encoding.
-* `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
-* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
-
-Please refer to the doc strings and code in `\ ``tokenization_openai.py`` <./pytorch_pretrained_bert/tokenization_openai.py>`_ for the details of the ``OpenAIGPTTokenizer``.
+.. autoclass:: pytorch_pretrained_bert.OpenAIGPTTokenizer
+    :members:
 
 
 ``OpenAIAdam``
 ~~~~~~~~~~~~~~~~~~
 
-``OpenAIAdam`` is similar to ``BertAdam``.
-The differences with ``BertAdam`` is that ``OpenAIAdam`` compensate for bias as in the regular Adam optimizer.
-
-``OpenAIAdam`` accepts the same arguments as ``BertAdam``.
+.. autoclass:: pytorch_pretrained_bert.OpenAIAdam
+    :members:
 
 
 9. ``OpenAIGPTModel``
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index bfcf26acbb..ca232ca876 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -1,31 +1,18 @@
 OpenAI GPT2
 ----------------------------------------------------
 
+``GPT2Config``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.GPT2Config
+    :members:
+
 
 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-``GPT2Tokenizer`` perform byte-level Byte-Pair-Encoding (BPE) tokenization.
-
-This class has three arguments:
-
-
-* ``vocab_file``\ : path to a vocabulary file.
-* ``merges_file``\ : path to a file containing the BPE merges.
-* ``errors``\ : How to handle unicode decoding errors. **Default = ``replace``\ **
-
-and two methods:
-
-
-* ``tokenize(text)``\ : convert a ``str`` in a list of ``str`` tokens by performing byte-level BPE.
-* ``convert_tokens_to_ids(tokens)``\ : convert a list of ``str`` tokens in a list of ``int`` indices in the vocabulary.
-* ``convert_ids_to_tokens(tokens)``\ : convert a list of ``int`` indices in a list of ``str`` tokens in the vocabulary.
-* ``set_special_tokens(self, special_tokens)``\ : update the list of special tokens (see above arguments)
-* ``encode(text)``\ : convert a ``str`` in a list of ``int`` tokens by performing byte-level BPE.
-* ``decode(tokens)``\ : convert back a list of ``int`` tokens in a ``str``.
-* `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: ``vocab_file_path``\ , ``merge_file_path``\ , ``special_tokens_file_path``. The vocabulary can be reloaded with ``OpenAIGPTTokenizer.from_pretrained('directory_path')``.
-
-Please refer to `\ ``tokenization_gpt2.py`` <./pytorch_pretrained_bert/tokenization_gpt2.py>`_ for more details on the ``GPT2Tokenizer``.
+.. autoclass:: pytorch_pretrained_bert.GPT2Tokenizer
+    :members:
 
 
 14. ``GPT2Model``
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index c84693b38d..2d2c38b250 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -2,14 +2,18 @@ Transformer XL
 ----------------------------------------------------
 
 
+``TransfoXLConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_pretrained_bert.TransfoXLConfig
+    :members:
+
+
 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-``TransfoXLTokenizer`` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper (\ `Efficient softmax approximation for GPUs <http://arxiv.org/abs/1609.04309>`_\ ) for more details.
-
-The API is similar to the API of ``BertTokenizer`` (see above).
-
-Please refer to the doc strings and code in `\ ``tokenization_transfo_xl.py`` <./pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ for the details of these additional methods in ``TransfoXLTokenizer``.
+.. autoclass:: pytorch_pretrained_bert.TransfoXLTokenizer
+    :members:
 
 
 12. ``TransfoXLModel``
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 70b5fa3b40..086bf8782c 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -1,2 +1,5 @@
 XLM
 ----------------------------------------------------
+
+
+I don't really know what to put here, I'll leave it up to you to decide @Thom
\ No newline at end of file
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index d2fd996cbe..8138d1bcdb 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -1,2 +1,4 @@
 XLNet
 ----------------------------------------------------
+
+I don't really know what to put here, I'll leave it up to you to decide @Thom
\ No newline at end of file

From a60ae1a50582e8ee1870f1e1a7dd3a02787cd07c Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 8 Jul 2019 11:50:32 -0400
Subject: [PATCH 068/139] Docstrings best practice shown in the BERT
 documentation.

---
 pytorch_pretrained_bert/modeling_bert.py     | 609 ++++++++++---------
 pytorch_pretrained_bert/optimization.py      |   3 +-
 pytorch_pretrained_bert/tokenization_bert.py |  28 +-
 3 files changed, 345 insertions(+), 295 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_pretrained_bert/modeling_bert.py
index 30ea4ece9e..7438764bba 100644
--- a/pytorch_pretrained_bert/modeling_bert.py
+++ b/pytorch_pretrained_bert/modeling_bert.py
@@ -150,27 +150,11 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
 
 class BertConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `BertModel`.
-    """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    r"""
+        :class:`~pytorch_pretrained_bert.BertConfig` is the configuration class to store the configuration of a
+        `BertModel`.
 
-    def __init__(self,
-                 vocab_size_or_config_json_file=30522,
-                 hidden_size=768,
-                 num_hidden_layers=12,
-                 num_attention_heads=12,
-                 intermediate_size=3072,
-                 hidden_act="gelu",
-                 hidden_dropout_prob=0.1,
-                 attention_probs_dropout_prob=0.1,
-                 max_position_embeddings=512,
-                 type_vocab_size=2,
-                 initializer_range=0.02,
-                 layer_norm_eps=1e-12,
-                 **kwargs):
-        """Constructs BertConfig.
-
-        Args:
+        Arguments:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
@@ -192,6 +176,24 @@ class BertConfig(PretrainedConfig):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
+    """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 **kwargs):
+        """Constructs BertConfig.
         """
         super(BertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
@@ -707,53 +709,17 @@ class BertForPreTraining(BertPreTrainedModel):
         - the masked language modeling head, and
         - the next sentence classification head.
 
-    Params:
+    Args:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, 1].
-            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example ::
 
-    Outputs:
-        if `masked_lm_labels` and `next_sentence_label` are not `None`:
-            Outputs the total_loss which is the sum of the masked language modeling loss and the next
-            sentence classification loss.
-        if `masked_lm_labels` or `next_sentence_label` is `None`:
-            Outputs a tuple comprising
-            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
-            - the next sentence classification logits of shape [batch_size, 2].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = BertForPreTraining(config)
-    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForPreTraining(config)
     """
     def __init__(self, config):
         super(BertForPreTraining, self).__init__(config)
@@ -765,6 +731,56 @@ class BertForPreTraining(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
+                with indices selected in [0, 1].
+                0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+        Returns:
+            Either a torch.Tensor or tuple(torch.Tensor, torch.Tensor).
+
+            if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
+             sum of the masked language modeling loss and the next \
+            sentence classification loss.
+
+            if ``masked_lm_labels`` or ``next_sentence_label` is `None``, outputs a tuple comprising:
+                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
+                - the next sentence classification logits of shape [batch_size, 2].
+
+        Example ::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+            model = BertForPreTraining(config)
+            masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            masked_lm_logits_scores, seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
@@ -786,51 +802,17 @@ class BertForMaskedLM(BertPreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.
 
-    Params:
+    Args:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `head_mask`: an optional torch.LongTensor of shape [num_heads] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `masked_lm_labels` is  not `None`:
-            Outputs the masked language modeling loss.
-        if `masked_lm_labels` is `None`:
-            Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = BertForMaskedLM(config)
-    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForMaskedLM(config)
     """
     def __init__(self, config):
         super(BertForMaskedLM, self).__init__(config)
@@ -841,6 +823,45 @@ class BertForMaskedLM(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `head_mask`: an optional torch.LongTensor of shape [num_heads] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            Masked language modeling loss if `masked_lm_labels` is specified, masked language modeling
+            logits of shape [batch_size, sequence_length, vocab_size] otherwise.
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
+            # or
+            masked_lm_logits_scores = model.forward(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output = outputs[0]
@@ -859,48 +880,17 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.
 
-    Params:
+    Args:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, 1].
-            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `next_sentence_label` is not `None`:
-            Outputs the total_loss which is the sum of the masked language modeling loss and the next
-            sentence classification loss.
-        if `next_sentence_label` is `None`:
-            Outputs the next sentence classification logits of shape [batch_size, 2].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = BertForNextSentencePrediction(config)
-    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForNextSentencePrediction(config)
     """
     def __init__(self, config):
         super(BertForNextSentencePrediction, self).__init__(config)
@@ -911,6 +901,44 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+                with indices selected in [0, 1].
+                0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between
+                0 and 1.It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
+                0.0 => head is not masked.
+
+        Returns:
+            If `next_sentence_label` is specified, outputs the total_loss which is the sum of the masked language \
+            modeling loss and the next sentence classification loss.
+            if `next_sentence_label` is `None`, outputs the next sentence classification logits of shape [batch_size, 2].
+
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
@@ -936,43 +964,14 @@ class BertForSequenceClassification(BertPreTrainedModel):
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_labels].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, num_labels].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        num_labels = 2
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    num_labels = 2
-
-    model = BertForSequenceClassification(config, num_labels)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForSequenceClassification(config, num_labels)
     """
     def __init__(self, config):
         super(BertForSequenceClassification, self).__init__(config)
@@ -985,6 +984,40 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+                with indices selected in [0, ..., num_labels].
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape `[batch_size, num_labels]`.
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
@@ -1008,48 +1041,24 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
 class BertForMultipleChoice(BertPreTrainedModel):
     """BERT model for multiple choice tasks.
-    This module is composed of the BERT model with a linear layer on top of
-    the pooled output.
+    This module is composed of the BERT model with a linear layer on top of the pooled output.
 
-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
-            and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, num_labels].
+        # Already been converted into WordPiece token ids
+        input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+        input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
+        token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
-    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
-    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = BertForMultipleChoice(config)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForMultipleChoice(config)
+        logits = model(input_ids, token_type_ids, input_mask)
     """
     def __init__(self, config):
         super(BertForMultipleChoice, self).__init__(config)
@@ -1061,6 +1070,41 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
+            `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+                with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
+                and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape [batch_size, num_labels].
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+            input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
+            token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
+            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+            model = BertForMultipleChoice(config)
+            logits = model(input_ids, token_type_ids, input_mask)
+        """
         """ Input shapes should be [bsz, num choices, seq length] """
         num_choices = input_ids.shape[1]
 
@@ -1089,49 +1133,20 @@ class BertForTokenClassification(BertPreTrainedModel):
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.
 
-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [0, ..., num_labels].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        num_labels = 2
 
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    num_labels = 2
-
-    model = BertForTokenClassification(config, num_labels)
-    logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForTokenClassification(config, num_labels)
     """
     def __init__(self, config):
         super(BertForTokenClassification, self).__init__(config)
@@ -1144,6 +1159,40 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
+                with indices selected in [0, ..., num_labels].
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
@@ -1171,51 +1220,17 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
 
-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `start_positions` and `end_positions` are not `None`:
-            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
-        if `start_positions` or `end_positions` is `None`:
-            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
-            position tokens of shape [batch_size, sequence_length].
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = BertForQuestionAnswering(config)
-    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = BertForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(BertForQuestionAnswering, self).__init__(config)
@@ -1228,6 +1243,42 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
+        """
+        Parameters:
+            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see BERT paper for more details).
+            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if `start_positions` and `end_positions` are not `None`, outputs the total_loss which is the sum of the
+            CrossEntropy loss for the start and end token positions.
+            if `start_positions` or `end_positions` is `None`, outputs a tuple of start_logits, end_logits which are the
+            logits respectively for the start and end position tokens of shape [batch_size, sequence_length].
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 03856956ac..d13dd45c6b 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -182,7 +182,8 @@ SCHEDULES = {
 
 class BertAdam(Optimizer):
     """Implements BERT version of Adam algorithm with weight decay fix.
-    Params:
+
+    Parameters:
         lr: learning rate
         warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
         t_total: total number of training steps for the learning
diff --git a/pytorch_pretrained_bert/tokenization_bert.py b/pytorch_pretrained_bert/tokenization_bert.py
index c8db62b9c0..2a1105affe 100644
--- a/pytorch_pretrained_bert/tokenization_bert.py
+++ b/pytorch_pretrained_bert/tokenization_bert.py
@@ -84,24 +84,22 @@ def whitespace_tokenize(text):
 
 
 class BertTokenizer(object):
-    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+    r"""
+    Constructs a BertTokenizer.
+    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
+            minimum of this value (if specified) and the underlying BERT model's sequence length.
+        never_split: List of tokens which will never be split during tokenization. Only has an effect when
+            do_wordpiece_only=False
+    """
 
     def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
                  never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
-        """Constructs a BertTokenizer.
-
-        Args:
-          vocab_file: Path to a one-wordpiece-per-line vocabulary file
-          do_lower_case: Whether to lower case the input
-                         Only has an effect when do_wordpiece_only=False
-          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-          max_len: An artificial maximum length to truncate tokenized sequences to;
-                         Effective maximum length is always the minimum of this
-                         value (if specified) and the underlying BERT model's
-                         sequence length.
-          never_split: List of tokens which will never be split during tokenization.
-                         Only has an effect when do_wordpiece_only=False
-        """
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "

From ab30651802304b048697447f64d9a4ea5d27cc12 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 8 Jul 2019 16:05:26 -0400
Subject: [PATCH 069/139] Hugging Face theme.

---
 README.md                                     |  10 +-
 docs/source/_static/css/code-snippets.css     |  12 ++
 docs/source/_static/css/huggingface.css       | 144 ++++++++++++++++++
 docs/source/_static/js/custom.js              |  18 +++
 docs/source/_static/js/huggingface_logo.svg   |  47 ++++++
 docs/source/conf.py                           |   4 +
 .../imgs/warmup_constant_schedule.png         | Bin
 .../warmup_cosine_hard_restarts_schedule.png  | Bin
 .../imgs/warmup_cosine_schedule.png           | Bin
 .../warmup_cosine_warm_restarts_schedule.png  | Bin
 .../imgs/warmup_linear_schedule.png           | Bin
 docs/source/index.rst                         |   2 +-
 docs/source/model_doc/overview.rst            |  40 ++---
 13 files changed, 252 insertions(+), 25 deletions(-)
 create mode 100644 docs/source/_static/css/code-snippets.css
 create mode 100644 docs/source/_static/css/huggingface.css
 create mode 100644 docs/source/_static/js/custom.js
 create mode 100644 docs/source/_static/js/huggingface_logo.svg
 rename docs/{ => source}/imgs/warmup_constant_schedule.png (100%)
 rename docs/{ => source}/imgs/warmup_cosine_hard_restarts_schedule.png (100%)
 rename docs/{ => source}/imgs/warmup_cosine_schedule.png (100%)
 rename docs/{ => source}/imgs/warmup_cosine_warm_restarts_schedule.png (100%)
 rename docs/{ => source}/imgs/warmup_linear_schedule.png (100%)

diff --git a/README.md b/README.md
index a5234bd9ba..4c5b45ea2d 100644
--- a/README.md
+++ b/README.md
@@ -1116,22 +1116,22 @@ An overview of the implemented schedules:
 - `ConstantLR`: always returns learning rate 1.
 - `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
-    ![](docs/imgs/warmup_constant_schedule.png)
+    ![](docs/source/imgs/warmup_constant_schedule.png)
 - `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    ![](docs/imgs/warmup_linear_schedule.png)
+    ![](docs/source/imgs/warmup_linear_schedule.png)
 -  `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    ![](docs/imgs/warmup_cosine_schedule.png)
+    ![](docs/source/imgs/warmup_cosine_schedule.png)
 - `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts).
-    ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
+    ![](docs/source/imgs/warmup_cosine_hard_restarts_schedule.png)
 - `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length.
     Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
     followed by a learning rate decreasing from 1. to 0. following a cosine curve.
     Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles`
-    ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
+    ![](docs/source/imgs/warmup_cosine_warm_restarts_schedule.png)
 
 ## Examples
 
diff --git a/docs/source/_static/css/code-snippets.css b/docs/source/_static/css/code-snippets.css
new file mode 100644
index 0000000000..4d525e95d7
--- /dev/null
+++ b/docs/source/_static/css/code-snippets.css
@@ -0,0 +1,12 @@
+
+.highlight .c1{
+    color: #999
+}
+
+.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp {
+    color: #FB8D68;
+}
+
+.highlight .kn, .highlight .nv, .highlight .s2 {
+    color: #6670FF;
+}
\ No newline at end of file
diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
new file mode 100644
index 0000000000..f50726b57d
--- /dev/null
+++ b/docs/source/_static/css/huggingface.css
@@ -0,0 +1,144 @@
+/* The literal code blocks */
+.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
+    color: #6670FF;
+}
+
+/* To keep the logo centered */
+.wy-side-scroll {
+    width: auto;
+}
+
+/* The div that holds the Hugging Face logo */
+.HuggingFaceDiv {
+    width: 100%
+}
+
+/* The research field on top of the toc tree */
+.wy-side-nav-search{
+    background-color: #6670FF;
+}
+
+/* The toc tree */
+.wy-nav-side{
+    background-color: #6670FF;
+}
+
+/* The selected items in the toc tree */
+.wy-menu-vertical li.current{
+    background-color: #A6B0FF;
+}
+
+/* When a list item that does belong to the selected block from the toc tree is hovered */
+.wy-menu-vertical li.current a:hover{
+    background-color: #FB8D68;
+}
+
+/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
+.wy-menu-vertical li a:hover{
+    background-color: #FB8D68;
+}
+
+/* The text items on the toc tree */
+.wy-menu-vertical a {
+    color: #FFFFDD;
+    font-family: Calibre-Light;
+}
+.wy-menu-vertical header, .wy-menu-vertical p.caption{
+    color: white;
+    font-family: Calibre-Light;
+}
+
+/* The color inside the selected toc tree block */
+.wy-menu-vertical li.toctree-l2 a, .wy-menu-vertical li.toctree-l3 a, .wy-menu-vertical li.toctree-l4 a {
+    color: black;
+}
+
+/* Inside the depth-2 selected toc tree block */
+.wy-menu-vertical li.toctree-l2.current>a {
+    background-color: #B6C0FF
+}
+.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {
+    background-color: #C6D0FF
+}
+
+/* Inside the depth-3 selected toc tree block */
+.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{
+    background-color: #D6E0FF
+}
+
+/* Inside code snippets */
+.rst-content dl:not(.docutils) dt{
+    font-size: 15px;
+}
+
+/* Links */
+a {
+    color: #6670FF;
+}
+
+/* Content bars */
+.rst-content dl:not(.docutils) dt {
+    background-color: rgba(251, 141, 104, 0.1);
+    border-right: solid 2px #FB8D68;
+    border-left: solid 2px #FB8D68;
+    color: #FB8D68;
+    font-family: Calibre-Light;
+    border-top: none;
+    font-style: normal !important;
+}
+
+/* Expand button */
+.wy-menu-vertical li.toctree-l2 span.toctree-expand,
+.wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li.current>a span.toctree-expand,
+.wy-menu-vertical li.toctree-l3 span.toctree-expand{
+    color: black;
+}
+
+/* Max window size */
+.wy-nav-content{
+    max-width: 1200px;
+}
+
+/* Mobile header */
+.wy-nav-top{
+    background-color: #6670FF;
+}
+
+/* FONTS */
+body{
+    font-family: Calibre;
+    font-size: 20px;
+}
+
+h1 {
+    font-family: Calibre-Thin;
+    font-size: 70px;
+}
+
+h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
+    font-family: Calibre-Medium;
+}
+
+@font-face {
+    font-family: Calibre-Medium;
+    src: url(./Calibre-Medium.otf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre;
+    src: url(./Calibre-Regular.otf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre-Light;
+    src: url(./Calibre-Light.ttf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre-Thin;
+    src: url(./Calibre-Thin.otf);
+    font-weight:400;
+}
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
new file mode 100644
index 0000000000..9ddbbb7c49
--- /dev/null
+++ b/docs/source/_static/js/custom.js
@@ -0,0 +1,18 @@
+function addIcon() {
+    const huggingFaceLogo = "http://lysand.re/huggingface_logo.svg";
+    const image = document.createElement("img");
+    image.setAttribute("src", huggingFaceLogo)
+
+
+    const div = document.createElement("div")
+    div.appendChild(image);
+    div.style.textAlign = 'center';
+    div.style.paddingTop = '30px';
+    div.style.backgroundColor = '#6670FF'
+
+    const scrollDiv = document.getElementsByClassName("wy-side-scroll")[0];
+    scrollDiv.prepend(div)
+}
+
+window.addEventListener("load", addIcon)
+
diff --git a/docs/source/_static/js/huggingface_logo.svg b/docs/source/_static/js/huggingface_logo.svg
new file mode 100644
index 0000000000..84974866ce
--- /dev/null
+++ b/docs/source/_static/js/huggingface_logo.svg
@@ -0,0 +1,47 @@
+<svg width="95px" height="88px" viewBox="0 0 95 88" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <!-- Generator: Sketch 43.2 (39069) - http://www.bohemiancoding.com/sketch -->
+    <title>icon</title>
+    <desc>Created with Sketch.</desc>
+    <defs>
+        <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
+    </defs>
+    <g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+        <g id="icon_desktop">
+            <g id="icon">
+                <g id="icon_desktop">
+                    <g id="Group-2">
+                        <g id="Group">
+                            <path d="M93.7930402,70.08 C94.5430402,72.24 94.3630402,74.54 93.3630402,76.54 C92.6430402,78 91.6130402,79.13 90.3530402,80.14 C88.8330402,81.34 86.9430402,82.36 84.6630402,83.34 C81.9430402,84.5 78.6230402,85.59 77.1030402,85.99 C73.2130402,87 69.4730402,87.64 65.6830402,87.67 C60.2630402,87.72 55.5930402,86.44 52.2730402,83.17 C50.5530402,83.38 48.8130402,83.5 47.0630402,83.5 C45.4030402,83.5 43.7630402,83.4 42.1330402,83.2 C38.8030402,86.45 34.1530402,87.72 28.7530402,87.67 C24.9630402,87.64 21.2230402,87 17.3230402,85.99 C15.8130402,85.59 12.4930402,84.5 9.77304019,83.34 C7.49304019,82.36 5.60304019,81.34 4.09304019,80.14 C2.82304019,79.13 1.79304019,78 1.07304019,76.54 C0.0830401858,74.54 -0.106959814,72.24 0.653040186,70.08 C-0.0469598142,68.43 -0.226959814,66.54 0.323040186,64.45 C0.573040186,63.5 0.983040186,62.62 1.50304019,61.84 C1.39304019,61.43 1.30304019,61.01 1.24304019,60.55 C0.863040186,57.81 1.81304019,55.31 3.60304019,53.37 C4.48304019,52.4 5.43304019,51.73 6.42304019,51.3 C5.69304019,48.2 5.31304019,45.01 5.31304019,41.75 C5.31304019,18.69 24.0030402,0 47.0630402,0 C54.9830402,0 62.3930402,2.2 68.7130402,6.04 C69.8530402,6.74 70.9730402,7.49 72.0430402,8.29 C72.5730402,8.69 73.1030402,9.1 73.6130402,9.53 C74.1330402,9.95 74.6430402,10.39 75.1330402,10.84 C76.6130402,12.19 78.0030402,13.64 79.2730402,15.19 C79.7030402,15.7 80.1130402,16.23 80.5130402,16.77 C81.3230402,17.84 82.0730402,18.95 82.7630402,20.1 C83.8130402,21.82 84.7330402,23.62 85.5330402,25.49 C86.0630402,26.74 86.5230402,28.02 86.9330402,29.33 C87.5430402,31.29 88.0130402,33.31 88.3330402,35.39 C88.4330402,36.08 88.5230402,36.78 88.5930402,37.48 C88.7330402,38.88 88.8130402,40.3 88.8130402,41.75 C88.8130402,44.97 88.4330402,48.13 87.7230402,51.18 C88.8230402,51.61 89.8630402,52.31 90.8330402,53.37 C92.6230402,55.31 93.5730402,57.82 93.1930402,60.56 C93.1330402,61.01 93.0430402,61.43 92.9330402,61.84 C93.4530402,62.62 93.8630402,63.5 94.1130402,64.45 C94.6630402,66.54 94.4830402,68.43 93.7930402,70.08" id="Fill-1" fill="#FFFFFF" fill-rule="nonzero"></path>
+                            <circle id="Oval" fill="#FFD21E" fill-rule="nonzero" cx="46.75" cy="41.75" r="34.75"></circle>
+                            <path d="M81.5,41.75 C81.5,22.5581049 65.9418951,7 46.75,7 C27.5581049,7 12,22.5581049 12,41.75 C12,60.9418951 27.5581049,76.5 46.75,76.5 C65.9418951,76.5 81.5,60.9418951 81.5,41.75 Z M8,41.75 C8,20.3489659 25.3489659,3 46.75,3 C68.1510341,3 85.5,20.3489659 85.5,41.75 C85.5,63.1510341 68.1510341,80.5 46.75,80.5 C25.3489659,80.5 8,63.1510341 8,41.75 Z" id="Oval" fill="#FFAC03" fill-rule="nonzero"></path>
+                            <path d="M57.1723547,31.7151181 C58.0863134,32.7107502 57.3040427,35.2620959 58.7620957,35.2620959 C61.5235194,35.2620959 63.7620957,33.0235196 63.7620957,30.2620959 C63.7620957,27.5006721 61.5235194,25.2620959 58.7620957,25.2620959 C56.0006719,25.2620959 53.7620957,27.5006721 53.7620957,30.2620959 C53.7620957,31.5654666 56.3553563,30.8251108 57.1723547,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(58.762096, 30.262096) rotate(-28.000000) translate(-58.762096, -30.262096) "></path>
+                            <path d="M32.1723553,31.7151181 C33.086314,32.7107502 32.3040433,35.2620959 33.7620963,35.2620959 C36.52352,35.2620959 38.7620963,33.0235196 38.7620963,30.2620959 C38.7620963,27.5006721 36.52352,25.2620959 33.7620963,25.2620959 C31.0006725,25.2620959 28.7620963,27.5006721 28.7620963,30.2620959 C28.7620963,31.5654666 31.3553569,30.8251108 32.1723553,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(33.762096, 30.262096) scale(-1, 1) rotate(-28.000000) translate(-33.762096, -30.262096) "></path>
+                            <g id="Oval-4" transform="translate(33.500000, 41.500000)">
+                                <g id="Mask" fill-rule="nonzero" fill="#3A3B45">
+                                    <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
+                                </g>
+                                <g id="Clipped">
+                                    <mask id="mask-2" fill="white">
+                                        <use xlink:href="#path-1"></use>
+                                    </mask>
+                                    <g id="path-1"></g>
+                                    <path d="M13.25,25 C18.0399291,25 21.9229338,21.1169953 21.9229338,16.3270662 C21.9229338,12.5962324 19.5672252,9.41560375 16.2620987,8.19147116 C16.1404592,8.14641904 16.0175337,8.10401696 15.8933923,8.06433503 C15.0599892,7.79793679 14.1717882,10.6623144 13.25,10.6623144 C12.3886883,10.6623144 11.5567012,7.77968641 10.7713426,8.01349068 C7.18916268,9.07991937 4.57706621,12.3984489 4.57706621,16.3270662 C4.57706621,21.1169953 8.46007093,25 13.25,25 Z" id="Shape" fill="#EF4E4E" fill-rule="nonzero" mask="url(#mask-2)"></path>
+                                </g>
+                            </g>
+                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="70.25" cy="33.75" r="3.25"></circle>
+                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="23.75" cy="33.75" r="3.25"></circle>
+                        </g>
+                    </g>
+                </g>
+                <g id="Group-4" transform="translate(3.000000, 48.000000)" fill-rule="nonzero">
+                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
+                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
+                </g>
+                <g id="Group-4" transform="translate(70.500000, 66.500000) scale(-1, 1) translate(-70.500000, -66.500000) translate(50.000000, 48.000000)" fill-rule="nonzero">
+                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
+                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
+                </g>
+            </g>
+        </g>
+    </g>
+</svg>
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7675393807..978b204466 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -176,5 +176,9 @@ epub_title = project
 # A list of files that should not be packed into the epub file.
 epub_exclude_files = ['search.html']
 
+def setup(app):
+    app.add_stylesheet('css/huggingface.css')
+    app.add_stylesheet('css/code-snippets.css')
+    app.add_js_file('js/custom.js')
 
 # -- Extension configuration -------------------------------------------------
diff --git a/docs/imgs/warmup_constant_schedule.png b/docs/source/imgs/warmup_constant_schedule.png
similarity index 100%
rename from docs/imgs/warmup_constant_schedule.png
rename to docs/source/imgs/warmup_constant_schedule.png
diff --git a/docs/imgs/warmup_cosine_hard_restarts_schedule.png b/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
similarity index 100%
rename from docs/imgs/warmup_cosine_hard_restarts_schedule.png
rename to docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
diff --git a/docs/imgs/warmup_cosine_schedule.png b/docs/source/imgs/warmup_cosine_schedule.png
similarity index 100%
rename from docs/imgs/warmup_cosine_schedule.png
rename to docs/source/imgs/warmup_cosine_schedule.png
diff --git a/docs/imgs/warmup_cosine_warm_restarts_schedule.png b/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
similarity index 100%
rename from docs/imgs/warmup_cosine_warm_restarts_schedule.png
rename to docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
diff --git a/docs/imgs/warmup_linear_schedule.png b/docs/source/imgs/warmup_linear_schedule.png
similarity index 100%
rename from docs/imgs/warmup_linear_schedule.png
rename to docs/source/imgs/warmup_linear_schedule.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
index d7b60bd660..49df768561 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,4 +1,4 @@
-Pytorch-Transformers: The Big & Extending Repository of pretrained Transformers
+Pytorch-Transformers
 ================================================================================================================================================
 
 
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index 8f5e94baf1..7c426aa798 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -39,10 +39,8 @@ configuration files. The respective configuration classes are:
 These configuration classes contains a few utilities to load and save configurations:
 
 
-* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters.
- Returns an instance of the configuration class.
-* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters.
-Returns an instance of the configuration class.
+* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
+* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
 * ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
 * ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
 * ``to_json_file(json_file_path)``\ : Save an instance to a json file.
@@ -247,40 +245,44 @@ An overview of the implemented schedules:
 
 
 * ``ConstantLR``\ : always returns learning rate 1.
-* ``WarmupConstantSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+* ``WarmupConstantSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
 
-  .. image:: docs/imgs/warmup_constant_schedule.png
-     :target: docs/imgs/warmup_constant_schedule.png
+  .. image:: /imgs/warmup_constant_schedule.png
+     :target: /imgs/warmup_constant_schedule.png
      :alt:
 
-* ``WarmupLinearSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+
+* ``WarmupLinearSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps.
 
-  .. image:: docs/imgs/warmup_linear_schedule.png
-     :target: docs/imgs/warmup_linear_schedule.png
+  .. image:: /imgs/warmup_linear_schedule.png
+     :target: /imgs/warmup_linear_schedule.png
      :alt:
 
-* ``WarmupCosineSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+
+* ``WarmupCosineSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
    Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve.
    If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
 
-  .. image:: docs/imgs/warmup_cosine_schedule.png
-     :target: docs/imgs/warmup_cosine_schedule.png
+  .. image:: /imgs/warmup_cosine_schedule.png
+     :target: /imgs/warmup_cosine_schedule.png
      :alt:
 
-* ``WarmupCosineWithHardRestartsSchedule``\ : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+
+* ``WarmupCosineWithHardRestartsSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
     If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
 
-  .. image:: docs/imgs/warmup_cosine_hard_restarts_schedule.png
-     :target: docs/imgs/warmup_cosine_hard_restarts_schedule.png
+  .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
+     :target: /imgs/warmup_cosine_hard_restarts_schedule.png
      :alt:
 
-* ``WarmupCosineWithWarmupRestartsSchedule``\ : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
+
+* ``WarmupCosineWithWarmupRestartsSchedule`` : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
     Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
     followed by a learning rate decreasing from 1. to 0. following a cosine curve.
     Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
 
-  .. image:: docs/imgs/warmup_cosine_warm_restarts_schedule.png
-     :target: docs/imgs/warmup_cosine_warm_restarts_schedule.png
+  .. image:: /imgs/warmup_cosine_warm_restarts_schedule.png
+     :target: /imgs/warmup_cosine_warm_restarts_schedule.png
      :alt:
\ No newline at end of file

From 6847e30e1ca742bf42768ab1ac25a52fde7f1aac Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 8 Jul 2019 17:34:24 -0400
Subject: [PATCH 070/139] New page detailing the use of TorchScript.

---
 docs/source/index.rst       |   1 +
 docs/source/torchscript.rst | 130 ++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 docs/source/torchscript.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 49df768561..4b5b982148 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -12,6 +12,7 @@ Pytorch-Transformers
     notebooks
     tpu
     cli
+    torchscript
 
 
 .. toctree::
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
new file mode 100644
index 0000000000..c94ce35fe2
--- /dev/null
+++ b/docs/source/torchscript.rst
@@ -0,0 +1,130 @@
+TorchScript
+================================================
+
+According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
+Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
+their model to be re-used in other programs, such as efficiency-oriented C++ programs.
+
+We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
+they can be exported, and what to be mindful of when using these models with TorchScript.
+
+Exporting a model needs two things:
+
+* dummy inputs to execute a model forward pass.
+* the model needs to be instantiated with the ``torchscript`` flag.
+
+These necessities imply several things developers should be careful about. These are detailed below.
+
+
+Implications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TorchScript flag and tied weights
+------------------------------------------------
+This flag is necessary because most of the language models in this repository have tied weights between their
+``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights,
+it is therefore necessary to untie the weights beforehand.
+
+This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
+separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
+leading to unexpected results.
+
+This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
+can be safely exported without the ``torchscript`` flag.
+
+Dummy inputs and standard lengths
+------------------------------------------------
+
+The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
+Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
+to create the "trace" of the model.
+
+The trace is created relatively to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
+input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
+as:
+
+``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
+
+will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
+input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
+will have been traced with a large input size however, the dimensions of the different matrix will be large as well,
+resulting in more calculations.
+
+It is recommended to be careful of the total number of operations done on each input and to follow performance closely
+when exporting varying sequence-length models.
+
+Using TorchScript in Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Below are examples of using the Python to save, load models as well as how to use the trace for inference.
+
+Saving a model
+------------------------------------------------
+
+This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
+according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
+
+.. code-block:: python
+
+    from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig
+    import torch
+
+    enc = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    # Tokenizing input text
+    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+    tokenized_text = enc.tokenize(text)
+
+    # Masking one of the input tokens
+    masked_index = 8
+    tokenized_text[masked_index] = '[MASK]'
+    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+    # Creating a dummy input
+    tokens_tensor = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    dummy_input = [tokens_tensor, segments_tensors]
+
+    # Initializing the model with the torchscript flag
+    # Flag set to True even though it is not necessary as this model does not have an LM Head.
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+
+    # Instantiating the model
+    model = BertModel(config)
+
+    # The model needs to be in evaluation mode
+    model.eval()
+
+    # Creating the trace
+    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+    torch.jit.save(traced_model, "traced_bert.pt")
+
+Loading a model
+------------------------------------------------
+
+This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
+We are re-using the previously initialised ``dummy_input``.
+
+.. code-block:: python
+
+    loaded_model = torch.jit.load("traced_model.pt")
+    loaded_model.eval()
+
+    all_encoder_layers, pooled_output = loaded_model(dummy_input)
+
+Using a traced model for inference
+------------------------------------------------
+
+Using the traced model for inference is as simple as using its ``__call__`` dunder method:
+
+.. code-block:: python
+
+    traced_model(tokens_tensor, segments_tensors)
+
+(Optional) Using TorchScript in C++
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Below are examples of using a model exported using Python in C++.

From b19786985d2dde9f91e20f5ce01f78a0cf7b6d0c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 10:25:18 +0200
Subject: [PATCH 071/139] unified tokenizer api and serialization + tests

---
 examples/run_glue.py                          |  78 ++--
 examples/run_xlnet_classifier.py              |   8 +-
 examples/utils_glue.py                        |   5 +-
 pytorch_transformers/__init__.py              |  18 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |   5 +-
 .../convert_xlm_checkpoint_to_pytorch.py      |   4 +-
 pytorch_transformers/modeling_bert.py         |   8 +-
 pytorch_transformers/modeling_gpt2.py         |   8 +-
 pytorch_transformers/modeling_openai.py       |   8 +-
 pytorch_transformers/modeling_transfo_xl.py   |   8 +-
 pytorch_transformers/modeling_utils.py        |  16 +
 pytorch_transformers/modeling_xlm.py          |   8 +-
 pytorch_transformers/modeling_xlnet.py        |   8 +-
 .../tests/modeling_bert_test.py               |   4 +-
 .../tests/modeling_tests_commons.py           |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   4 +-
 .../tests/modeling_utils_test.py              |   4 +-
 .../tests/modeling_xlm_test.py                |   4 +-
 .../tests/modeling_xlnet_test.py              |   4 +-
 .../tests/tokenization_bert_test.py           |  20 +-
 .../tests/tokenization_gpt2_test.py           |  41 +-
 .../tests/tokenization_openai_test.py         |  43 +-
 .../tests/tokenization_tests_commons.py       |  63 ++-
 .../tests/tokenization_transfo_xl_test.py     |  28 +-
 .../tests/tokenization_utils_test.py          |  10 +
 .../tests/tokenization_xlm_test.py            |  43 +-
 .../tests/tokenization_xlnet_test.py          |  54 +--
 pytorch_transformers/tokenization_bert.py     | 141 ++-----
 pytorch_transformers/tokenization_gpt2.py     | 113 ++----
 pytorch_transformers/tokenization_openai.py   | 125 ++----
 .../tokenization_transfo_xl.py                |  51 +--
 pytorch_transformers/tokenization_utils.py    | 383 ++++++++++++++++--
 pytorch_transformers/tokenization_xlm.py      | 132 ++----
 pytorch_transformers/tokenization_xlnet.py    | 128 ++----
 34 files changed, 824 insertions(+), 755 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 8dd845a553..59583ed712 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -32,9 +32,11 @@ from torch.utils.data.distributed import DistributedSampler
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForSequenceClassification
-from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenceClassification,
+                                  XLMForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
+                                  XLMTokenizer)
 from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
@@ -42,6 +44,21 @@ from utils_glue import processors, output_modes, convert_examples_to_features, c
 
 logger = logging.getLogger(__name__)
 
+ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+
+MODEL_CLASSES = {
+    'bert': BertForSequenceClassification,
+    'xlnet': XLNetForSequenceClassification,
+    'xlm': XLMForSequenceClassification,
+}
+
+TOKENIZER_CLASSES = {
+    'bert': BertTokenizer,
+    'xlnet': XLNetTokenizer,
+    'xlm': XLMTokenizer,
+}
 
 def train(args, train_features, model):
     """ Train the model """
@@ -156,7 +173,7 @@ def evalutate(args, eval_task, eval_output_dir, eval_features, model):
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_examples))
+    logger.info("  Num examples = %d", len(eval_features))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -208,7 +225,7 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
     examples = processor.get_dev_examples(args.data_dir)
     cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
         'dev' if eval else 'train',
-        list(filter(None, args.bert_model.split('/'))).pop(),
+        list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
 
@@ -217,6 +234,11 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
         features = torch.load(cached_features_file)
     else:
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+            cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
+            cls_token=tokenizer.cls_token,
+            sep_token=tokenizer.sep_token, cls_token_segment_id=2,
+            pad_on_left=True, pad_token_segment_id=4)
         if args.local_rank == -1 or torch.distributed.get_rank() == 0:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -230,12 +252,10 @@ def main():
     ## Required parameters
     parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train.")
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
 
@@ -243,9 +263,8 @@ def main():
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
@@ -263,8 +282,7 @@ def main():
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
+                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
@@ -331,8 +349,11 @@ def main():
         # Make sure only the first process in distributed training will download model & vocab
         torch.distributed.barrier()
 
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
+    args.model_type = args.model_name.lower().split('-')[0]
+    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    args.model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -359,27 +380,16 @@ def main():
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
+        model.save_pretrained(args.output_dir)
         tokenizer.save_vocabulary(args.output_dir)
 
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForSequenceClassification.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
-
         # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model)
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
-    model.to(args.device)
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = args.model_class.from_pretrained(args.output_dir)
+        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
 
     # Evaluation
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 7cf8a8d877..35b0ebfbd1 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -211,8 +211,8 @@ def main():
             logger.info("No cache file at %s, preparing train features", cached_train_features_file)
             train_features = convert_examples_to_features(
                 train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                cls_token_at_end=True, cls_token=tokenizer.cls_token,
+                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
                 pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving train features into cached file %s", cached_train_features_file)
@@ -369,8 +369,8 @@ def main():
             logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
             eval_features = convert_examples_to_features(
                 eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                cls_token_at_end=True, cls_token=tokenizer.cls_token,
+                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
                 pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 18e733567d..4750592957 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -396,7 +396,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                                  mask_padding_with_zero=True):
     """ Loads a data file into a list of `InputBatch`s
         `cls_token_at_end` define the location of the CLS token:
-            - False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
             - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
         `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
     """
@@ -489,8 +489,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                     [str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
             logger.info("label: %s (id = %d)" % (example.label, label_id))
 
         features.append(
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index 6dd78dfd02..c8f64a07de 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -11,22 +11,28 @@ from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering,
-                       load_tf_weights_in_bert)
+                       load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                       BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                              load_tf_weights_in_openai_gpt)
+                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                              OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
-                                  load_tf_weights_in_transfo_xl)
+                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
-                            load_tf_weights_in_gpt2)
+                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                            GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlnet import (XLNetConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
-                             load_tf_weights_in_xlnet)
+                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                             XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlm import (XLMConfig, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
-                           XLMForQuestionAnswering)
+                           XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                           XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                           PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 2d666a1f03..db23e5bffe 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -29,8 +29,7 @@ from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                          TransfoXLConfig,
                                                          TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME,
-                                                             VOCAB_NAME)
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -53,7 +52,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         with open(transfo_xl_dataset_file, "rb") as fp:
             corpus = pickle.load(fp, encoding="latin1")
         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
-        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file']
         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
         corpus_vocab_dict = corpus.vocab.__dict__
         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 0cbe962cea..e5815252f1 100755
--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -24,7 +24,7 @@ import torch
 import numpy
 
 from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
-from pytorch_transformers.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
@@ -42,7 +42,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
 
     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
     torch.save(model, pytorch_weights_dump_path)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index b2a456209d..0dd72b2969 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -33,7 +33,7 @@ from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrai
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
@@ -49,7 +49,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
 
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
@@ -152,7 +152,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 class BertConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `BertModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=30522,
@@ -543,7 +543,7 @@ class BertPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = BertConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_bert
     base_model_prefix = "bert"
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 090763cda1..9340ce8489 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -37,9 +37,9 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
+GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                 "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                  "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
@@ -103,7 +103,7 @@ def gelu(x):
 class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -358,7 +358,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = GPT2Config
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_gpt2
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index b715b18371..4a3ff732f6 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -37,8 +37,8 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
+OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
+OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
 
 
 def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
@@ -130,7 +130,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class OpenAIGPTConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `OpenAIGPTModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -384,7 +384,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = OpenAIGPTConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_openai_gpt
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 465577b002..35a1b635f9 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -41,10 +41,10 @@ from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrai
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
 
@@ -179,7 +179,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
 class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=267735,
@@ -838,7 +838,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = TransfoXLConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_transfo_xl
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 96558704ea..b9be1a3813 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -169,6 +169,22 @@ class PreTrainedModel(nn.Module):
         model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
         model_to_prune._prune_heads(heads_to_prune)
 
+    def save_pretrained(self, save_directory):
+        """ Save a model with its configuration file to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # Only save the model it-self if we are using distributed training
+        model_to_save = self.module if hasattr(self, 'module') else self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         """
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 14f8848a42..c7ea294dbd 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -40,10 +40,10 @@ from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTra
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
 }
 
@@ -51,7 +51,7 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
@@ -357,7 +357,7 @@ class XLMPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = XLMConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 289dcbd9db..628dbe7450 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -38,10 +38,10 @@ from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTra
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 }
 
@@ -195,7 +195,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLNetModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=32000,
@@ -593,7 +593,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = XLNetConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_xlnet
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index 2ba59317be..fbdce29366 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -267,7 +267,7 @@ class BertModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
index b831f85552..db79b017c1 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_tests_commons.py
@@ -413,7 +413,7 @@ class GPTModelTester(object):
 
     def create_and_check_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
             model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.parent.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index f2906d879f..49ba1addf1 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -26,7 +26,7 @@ import pytest
 import torch
 
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -185,7 +185,7 @@ class TransfoXLModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
index 5e3b8e676a..a168c24611 100644
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -20,12 +20,12 @@ import unittest
 import logging
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 class ModelUtilsTest(unittest.TestCase):
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             config = BertConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, PretrainedConfig)
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 9c511f21a8..6e2e082d19 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -21,7 +21,7 @@ import shutil
 import pytest
 
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
-from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -251,7 +251,7 @@ class XLMModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index b762426d2c..e167e2d2e8 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -26,7 +26,7 @@ import pytest
 import torch
 
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -279,7 +279,7 @@ class XLNetModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 37e20cc286..220bf45346 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -17,14 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import shutil
-import pytest
 
 from pytorch_transformers.tokenization_bert import (BasicTokenizer,
-                                                  BertTokenizer,
-                                                  WordpieceTokenizer,
-                                                  _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                    BertTokenizer,
+                                                    WordpieceTokenizer,
+                                                    _is_control, _is_punctuation,
+                                                    _is_whitespace, VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -33,13 +31,15 @@ class TokenizationTest(unittest.TestCase):
     def test_full_tokenizer(self):
         vocab_tokens = [
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-            "##ing", ","
+            "##ing", ",", "low", "lowest",
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
+        vocab_directory = "/tmp/"
+        vocab_file = os.path.join(vocab_directory, VOCAB_FILES_NAMES['vocab_file'])
+        with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
 
-        create_and_check_tokenizer_commons(self, BertTokenizer, vocab_file)
+        create_and_check_tokenizer_commons(self, BertTokenizer, pretrained_model_name_or_path=vocab_directory)
 
         tokenizer = BertTokenizer(vocab_file)
 
@@ -80,7 +80,7 @@ class TokenizationTest(unittest.TestCase):
         vocab = {}
         for (i, token) in enumerate(vocab_tokens):
             vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab)
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
 
         self.assertListEqual(tokenizer.tokenize(""), [])
 
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 8b06161b53..30959ceed1 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
+import tempfile
 
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -28,31 +29,31 @@ class GPT2TokenizationTest(unittest.TestCase):
         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
-                 "low", "lowest", "newer", "wider"]
+                 "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
+        special_tokens_map = {"unk_token": "<unk>"}
 
-        create_and_check_tokenizer_commons(self, GPT2Tokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        text = "lower"
-        bpe_tokens = ["low", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            create_and_check_tokenizer_commons(self, GPT2Tokenizer, tmpdirname, **special_tokens_map)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [13, 12, 16]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)
+            text = "lower"
+            bpe_tokens = ["low", "er"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
 
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            input_tokens = tokens + [tokenizer.unk_token]
+            input_bpe_tokens = [13, 12, 17]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 3f8c49f888..22f7d70017 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -32,31 +31,31 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
 
-        create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, tmpdirname)
 
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            text = "lower"
+            bpe_tokens = ["low", "er</w>"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + ["<unk>"]
+            input_bpe_tokens = [14, 15, 20]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 876f7747be..07f962bcab 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import sys
 from io import open
+import tempfile
 
 if sys.version_info[0] == 3:
     unicode = str
@@ -28,22 +29,19 @@ else:
 
 
 def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
 
-    vocab_path="/tmp/"
-    output_files = tokenizer.save_vocabulary(vocab_path=vocab_path)
-    tokenizer = tokenizer.from_pretrained(vocab_path)
-
-    for f in output_files:
-        os.remove(f)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tokenizer.save_pretrained(tmpdirname)
+        tokenizer = tokenizer.from_pretrained(tmpdirname)
 
     after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
     tester.assertListEqual(before_tokens, after_tokens)
 
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     text = u"Munich and Berlin are nice cities"
     filename = u"/tmp/tokenizer.bin"
@@ -58,8 +56,54 @@ def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs
     tester.assertListEqual(subwords, subwords_loaded)
 
 
+def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+
+    vocab_size = tokenizer.vocab_size
+    all_size = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size, 0)
+    tester.assertEqual(vocab_size, all_size)
+
+    new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
+    added_toks = tokenizer.add_tokens(new_toks)
+    vocab_size_2 = tokenizer.vocab_size
+    all_size_2 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_2, 0)
+    tester.assertEqual(vocab_size, vocab_size_2)
+    tester.assertEqual(added_toks, len(new_toks))
+    tester.assertEqual(all_size_2, all_size + len(new_toks))
+
+    tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+    tester.assertGreaterEqual(len(tokens), 4)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+
+    new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
+                  'pad_token': "<<<<<|||>|>>>>|>"}
+    added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+    vocab_size_3 = tokenizer.vocab_size
+    all_size_3 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_3, 0)
+    tester.assertEqual(vocab_size, vocab_size_3)
+    tester.assertEqual(added_toks_2, len(new_toks_2))
+    tester.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+
+    tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+
+    tester.assertGreaterEqual(len(tokens), 6)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[0], tokens[1])
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokens[-3])
+    tester.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
+    tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
+
+
 def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     text = u"He is very happy, UNwant\u00E9d,running"
     tokens = tokenizer.tokenize(text)
@@ -75,5 +119,6 @@ def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs
 
 def create_and_check_tokenizer_commons(tester, tokenizer_class, *inputs, **kwargs):
     create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+    create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index f583e30b56..a4ddd357b9 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -28,22 +27,23 @@ class TransfoXLTokenizationTest(unittest.TestCase):
 
     def test_full_tokenizer(self):
         vocab_tokens = [
-            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
+            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
+            "running", ",", "low", "l",
         ]
-        with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-            vocab_file = vocab_writer.name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-        create_and_check_tokenizer_commons(self, TransfoXLTokenizer, vocab_file=vocab_file, lower_case=True)
+            create_and_check_tokenizer_commons(self, TransfoXLTokenizer, tmpdirname, lower_case=True)
 
-        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
-        os.remove(vocab_file)
+            tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
 
-        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+            tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
+            self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/pytorch_transformers/tests/tokenization_utils_test.py
index e8856d50c2..26ec2d7a39 100644
--- a/pytorch_transformers/tests/tokenization_utils_test.py
+++ b/pytorch_transformers/tests/tokenization_utils_test.py
@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
+import six
 
 from pytorch_transformers import PreTrainedTokenizer
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -27,8 +28,17 @@ class TokenizerUtilsTest(unittest.TestCase):
         for model_name in s3_models[:1]:
             tokenizer = tokenizer_class.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, tokenizer_class)
             self.assertIsInstance(tokenizer, PreTrainedTokenizer)
 
+            for special_tok in tokenizer.all_special_tokens:
+                if six.PY2:
+                    self.assertIsInstance(special_tok, unicode)
+                else:
+                    self.assertIsInstance(special_tok, str)
+                special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
+                self.assertIsInstance(special_tok_id, int)
+
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
 
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 00d273a628..b543ed23f8 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -31,31 +30,31 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
 
-        create_and_check_tokenizer_commons(self, XLMTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = XLMTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            create_and_check_tokenizer_commons(self, XLMTokenizer, tmpdirname)
 
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            tokenizer = XLMTokenizer(vocab_file, merges_file)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            text = "lower"
+            bpe_tokens = ["low", "er</w>"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + ["<unk>"]
+            input_bpe_tokens = [14, 15, 20]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 6e81f214b7..8fc98209ba 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -16,10 +16,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE, VOCAB_FILES_NAMES)
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -29,34 +28,37 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
 class XLNetTokenizationTest(unittest.TestCase):
 
     def test_full_tokenizer(self):
-        create_and_check_tokenizer_commons(self, XLNetTokenizer, SAMPLE_VOCAB)
-
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        tokens = tokenizer.tokenize(u'This is a test')
-        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+            create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
 
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0,
-                            602, 347, 347, 347, 3, 12, 66,
-                            46, 72, 80, 6, 0, 4])
+            tokens = tokenizer.tokenize(u'This is a test')
+            self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
 
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
-                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
-                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
-                                           u'<unk>', u'.'])
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+            tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+            self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                        u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                        u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+            self.assertListEqual(
+                ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                    602, 347, 347, 347, 3, 12, 66,
+                    46, 72, 80, 6, 0, 4])
+
+            back_tokens = tokenizer.convert_ids_to_tokens(ids)
+            self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                            u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                            SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                            SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                            u'<unk>', u'.'])
 
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index b26e5066e9..3e14673f46 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -22,7 +22,6 @@ import os
 import unicodedata
 from io import open
 
-from .file_utils import cached_path
 from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
@@ -32,20 +31,21 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
-    'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-}}
+        'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+        'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+        'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+        'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+        'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+        'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+        'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
+        'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
+        'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
+        'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
+    }
+}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-uncased': 512,
@@ -93,8 +93,9 @@ class BertTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", **kwargs):
         """Constructs a BertTokenizer.
 
         Args:
@@ -102,17 +103,18 @@ class BertTokenizer(PreTrainedTokenizer):
           do_lower_case: Whether to lower case the input
                          Only has an effect when do_wordpiece_only=False
           do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-          max_len: An artificial maximum length to truncate tokenized sequences to;
-                         Effective maximum length is always the minimum of this
-                         value (if specified) and the underlying BERT model's
-                         sequence length.
           never_split: List of tokens which will never be split during tokenization.
                          Only has an effect when do_wordpiece_only=False
         """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        if never_split is None:
+            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
@@ -120,90 +122,34 @@ class BertTokenizer(PreTrainedTokenizer):
         if do_basic_tokenize:
           self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                 never_split=never_split)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-        self.max_len = max_len if max_len is not None else int(1e12)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
     @property
-    def UNK_TOKEN(self):
-        return "[UNK]"
+    def vocab_size(self):
+        return len(self.vocab)
 
-    @property
-    def SEP_TOKEN(self):
-        return "[SEP]"
-
-    @property
-    def PAD_TOKEN(self):
-        return "[PAD]"
-
-    @property
-    def CLS_TOKEN(self):
-        return "[CLS]"
-
-    @property
-    def MASK_TOKEN(self):
-        return "[MASK]"
-
-    @property
-    def UNK_ID(self):
-        return self.vocab["[UNK]"]
-
-    @property
-    def SEP_ID(self):
-        return self.vocab["[SEP]"]
-
-    @property
-    def PAD_ID(self):
-        return self.vocab["[PAD]"]
-
-    @property
-    def CLS_ID(self):
-        return self.vocab["[CLS]"]
-
-    @property
-    def MASK_ID(self):
-        return self.vocab["[MASK]"]
-
-    def tokenize(self, text):
+    def _tokenize(self, text):
         split_tokens = []
         if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text):
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                 for sub_token in self.wordpiece_tokenizer.tokenize(token):
                     split_tokens.append(sub_token)
         else:
             split_tokens = self.wordpiece_tokenizer.tokenize(text)
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        for token in tokens:
-            ids.append(self.vocab[token])
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this BERT model ({} > {}). Running this"
-                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids):
-        """Converts a sequence of ids in wordpiece tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, token_ids, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(token_ids)
+        tokens = self.convert_ids_to_tokens(tokens_ids)
         out_string = ''.join(tokens).replace(' ##', '').strip()
-        if clean_up_tokenization_spaces:
-            for special_tok in (self.UNK_TOKEN, self.SEP_TOKEN, self.PAD_TOKEN, self.CLS_TOKEN, self.MASK_TOKEN):
-                out_string = out_string.replace(special_tok, '')
-            out_string = clean_up_tokenization(out_string)
         return out_string
 
     def save_vocabulary(self, vocab_path):
@@ -245,17 +191,20 @@ class BasicTokenizer(object):
 
     def __init__(self,
                  do_lower_case=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+                 never_split=None):
         """Constructs a BasicTokenizer.
 
         Args:
           do_lower_case: Whether to lower case the input.
         """
+        if never_split is None:
+            never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = never_split
 
-    def tokenize(self, text):
+    def tokenize(self, text, never_split=None):
         """Tokenizes a piece of text."""
+        never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -267,7 +216,7 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case and token not in self.never_split:
+            if self.do_lower_case and token not in never_split:
                 token = token.lower()
                 token = self._run_strip_accents(token)
             split_tokens.extend(self._run_split_on_punc(token))
@@ -286,9 +235,9 @@ class BasicTokenizer(object):
             output.append(char)
         return "".join(output)
 
-    def _run_split_on_punc(self, text):
+    def _run_split_on_punc(self, text, never_split=None):
         """Splits punctuation on a piece of text."""
-        if text in self.never_split:
+        if never_split is not None and text in never_split:
             return [text]
         chars = list(text)
         i = 0
@@ -360,7 +309,7 @@ class BasicTokenizer(object):
 class WordpieceTokenizer(object):
     """Runs WordPiece tokenization."""
 
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
         self.vocab = vocab
         self.unk_token = unk_token
         self.max_input_chars_per_word = max_input_chars_per_word
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index abdfe39c1c..af1ad2cf8f 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -38,7 +38,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -52,11 +51,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'gpt2': None,
-        'gpt2-medium': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -108,8 +102,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, errors='replace', max_len=None):
-        self.max_len = max_len if max_len is not None else int(1e12)
+    def __init__(self, vocab_file, merges_file, errors='replace',
+                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
+        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
         self.errors = errors # how to handle errors in decoding
@@ -123,32 +119,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         if token in self.cache:
@@ -191,7 +164,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
@@ -202,57 +175,27 @@ class GPT2Tokenizer(PreTrainedTokenizer):
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        text = ''.join(tokens_ids)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        if clean_up_tokenization_spaces:
-            text = text.replace('<unk>', '')
-            text = clean_up_tokenization(text)
         return text
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -268,14 +211,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 419dfdad92..16d355c57d 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
@@ -34,7 +30,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -46,10 +41,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'openai-gpt': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -88,14 +79,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     BPE tokenizer. Peculiarities:
         - lower case all inputs
         - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
+
         try:
             import ftfy
             import spacy
@@ -103,11 +94,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
@@ -115,35 +104,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
@@ -188,7 +151,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:
@@ -203,58 +166,26 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an id in a token (BPE) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -270,14 +201,4 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index a86c8fe460..0b4e8c0ca5 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -41,7 +41,7 @@ else:
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin'}
+VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     'pretrained_vocab_file':
@@ -67,9 +67,17 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
+    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
                  delimiter=None, vocab_file=None, pretrained_vocab_file=None,
-                 never_split=("<unk>", "<eos>", "<formula>")):
+                 never_split=None, unk_token="<unk>", eos_token="<eos>",
+                 additional_special_tokens=["<formula>"], **kwargs):
+        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
+                                                 additional_special_tokens=additional_special_tokens,
+                                                 **kwargs)
+        if never_split is None:
+            never_split = self.all_special_tokens
+        if special is None:
+            special = []
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -200,11 +208,13 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             self.idx2sym.append(sym)
             self.sym2idx[sym] = len(self.idx2sym) - 1
 
-    def get_sym(self, idx):
+    def _convert_id_to_token(self, idx):
+        """Converts an id in a token (BPE) using the vocab."""
         assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
         return self.idx2sym[idx]
 
-    def get_idx(self, sym):
+    def _convert_token_to_id(self, sym):
+        """ Converts a token (str/unicode) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
@@ -220,36 +230,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             else:
                 raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
 
-    def convert_ids_to_tokens(self, indices):
-        """Converts a sequence of indices in symbols using the vocab."""
-        return [self.get_sym(idx) for idx in indices]
-
-    def convert_tokens_to_ids(self, symbols):
-        """Converts a sequence of symbols into ids using the vocab."""
-        return [self.get_idx(sym) for sym in symbols]
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        out_string = ' '.join(tokens_ids).strip()
+        return out_string
 
     def convert_to_tensor(self, symbols):
         return torch.LongTensor(self.convert_tokens_to_ids(symbols))
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, indices, exclude=None, clean_up_tokenization_spaces=True):
-        """Converts a sequence of indices in a string."""
-        if exclude is None:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices])
-        else:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
-
-        if clean_up_tokenization_spaces:
-            out_string = clean_up_tokenization(out_string)
-
-        return out_string
-
-    def __len__(self):
+    @property
+    def vocab_size(self):
         return len(self.idx2sym)
 
-    def tokenize(self, line, add_eos=False, add_double_eos=False):
+    def _tokenize(self, line, add_eos=False, add_double_eos=False):
         line = line.strip()
         # convert to lower case
         if self.lower_case:
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 9004315657..b191dd22e6 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -16,37 +16,145 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-import sys
-import json
 import logging
 import os
-import regex as re
+import json
+import six
 from io import open
 
-try:
-    from functools import lru_cache
-except ImportError:
-    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
-    def lru_cache():
-        return lambda func: func
-
 from .file_utils import cached_path
 
 logger = logging.getLogger(__name__)
 
+SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
+ADDED_TOKENS_FILE = 'added_tokens.json'
 
 class PreTrainedTokenizer(object):
-    """ An abstract class to handle dowloading and loading pretrained tokenizers.
+    """ An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
+
+        Derived class can set up a few special tokens to be used in common scripts and internals:
+            bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
+            additional_special_tokens = []
+
+        We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
+            specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
     """
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
     max_model_input_sizes = {}
 
+    SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
+                                 "pad_token", "cls_token", "mask_token",
+                                 "additional_special_tokens"]
+
+    @property
+    def bos_token(self):
+        if self._bos_token is None:
+            logger.error("Using bos_token, but it is not set yet.")
+        return self._bos_token
+
+    @property
+    def eos_token(self):
+        if self._eos_token is None:
+            logger.error("Using eos_token, but it is not set yet.")
+        return self._eos_token
+
+    @property
+    def unk_token(self):
+        if self._unk_token is None:
+            logger.error("Using unk_token, but it is not set yet.")
+        return self._unk_token
+
+    @property
+    def sep_token(self):
+        if self._sep_token is None:
+            logger.error("Using sep_token, but it is not set yet.")
+        return self._sep_token
+
+    @property
+    def pad_token(self):
+        if self._pad_token is None:
+            logger.error("Using pad_token, but it is not set yet.")
+        return self._pad_token
+
+    @property
+    def cls_token(self):
+        if self._cls_token is None:
+            logger.error("Using cls_token, but it is not set yet.")
+        return self._cls_token
+
+    @property
+    def mask_token(self):
+        if self._mask_token is None:
+            logger.error("Using mask_token, but it is not set yet.")
+        return self._mask_token
+
+    @property
+    def additional_special_tokens(self):
+        if self._additional_special_tokens is None:
+            logger.error("Using additional_special_tokens, but it is not set yet.")
+        return self._additional_special_tokens
+
+    @bos_token.setter
+    def bos_token(self, value):
+        self._bos_token = value
+
+    @eos_token.setter
+    def eos_token(self, value):
+        self._eos_token = value
+
+    @unk_token.setter
+    def unk_token(self, value):
+        self._unk_token = value
+
+    @sep_token.setter
+    def sep_token(self, value):
+        self._sep_token = value
+
+    @pad_token.setter
+    def pad_token(self, value):
+        self._pad_token = value
+
+    @cls_token.setter
+    def cls_token(self, value):
+        self._cls_token = value
+
+    @mask_token.setter
+    def mask_token(self, value):
+        self._mask_token = value
+
+    @additional_special_tokens.setter
+    def additional_special_tokens(self, value):
+        self._additional_special_tokens = value
+
+    def __init__(self, max_len=None, **kwargs):
+        self._bos_token = None
+        self._eos_token = None
+        self._unk_token = None
+        self._sep_token = None
+        self._pad_token = None
+        self._cls_token = None
+        self._mask_token = None
+        self._additional_special_tokens = []
+
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.added_tokens_encoder = {}
+        self.added_tokens_decoder = {}
+
+        for key, value in kwargs.items():
+            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
+                raise ValueError(
+                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
+                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
+            else:
+                setattr(self, key, value)
+
+
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         return cls._from_pretrained(*inputs, **kwargs)
 
+
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
@@ -59,16 +167,20 @@ class PreTrainedTokenizer(object):
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
-            for file_id, file_name in cls.vocab_files_names.items():
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+            all_vocab_files_names.update(cls.vocab_files_names)
+            for file_id, file_name in all_vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We don't load it.".format(full_file_name))
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
-        # redirect to the cache, if necessary
+
+        # Get files from url, cache, or disk depending on the case
         try:
             resolved_vocab_files = {}
             for file_id, file_path in vocab_files.items():
@@ -95,6 +207,7 @@ class PreTrainedTokenizer(object):
                 logger.info("loading file {} from cache at {}".format(
                     file_path, resolved_vocab_files[file_id]))
 
+        # Set max length if needed
         if pretrained_model_name_or_path in cls.max_model_input_sizes:
             # if we're using a pretrained model, ensure the tokenizer
             # wont index sequences longer than the number of positional embeddings
@@ -102,31 +215,255 @@ class PreTrainedTokenizer(object):
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
 
         # Merge resolved_vocab_files arguments in kwargs.
+        added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
+        special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
         for args_name, file_path in resolved_vocab_files.items():
-            kwargs[args_name] = file_path
+            if args_name not in kwargs:
+                kwargs[args_name] = file_path
+        if special_tokens_map_file is not None:
+            special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
+            for key, value in special_tokens_map.items():
+                if key not in kwargs:
+                    kwargs[key] = value
 
         # Instantiate tokenizer.
         tokenizer = cls(*inputs, **kwargs)
 
+        # Add supplementary tokens.
+        if added_tokens_file is not None:
+            added_tokens = json.load(open(added_tokens_file, encoding="utf-8"))
+            added_tok_encoder = dict((tok, len(tokenizer) + i) for i, tok in enumerate(added_tokens))
+            added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+            tokenizer.added_tokens_encoder.update(added_tok_encoder)
+            tokenizer.added_tokens_decoder.update(added_tok_decoder)
+
         return tokenizer
 
-    def tokenize(self, text):
+
+    def save_pretrained(self, save_directory):
+        """ Save the tokenizer vocabulary files (with added tokens) and the
+            special-tokens-to-class-attributes-mapping to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Saving directory ({}) should be a directory".format(save_directory))
+            return
+
+        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
+        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
+
+        with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
+
+        with open(added_tokens_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.added_tokens_decoder, ensure_ascii=False))
+
+        vocab_files = self.save_vocabulary(save_directory)
+
+        return vocab_files + (special_tokens_map_file, added_tokens_file)
+
+
+    def save_vocabulary(self, save_directory):
+        """ Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
+            and special token mappings.
+            
+            Please use `save_pretrained()` to save the full Tokenizer state so that it can be
+            reloaded using the `from_pretrained(save_directory)` class method.
+        """
+        raise NotImplementedError
+
+
+    def vocab_size(self):
+        raise NotImplementedError
+
+
+    def __len__(self):
+        return self.vocab_size + len(self.added_tokens_encoder)
+
+
+    def add_tokens(self, new_tokens):
+        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
+            vocabulary, they are added to the added_tokens_encoder with indices starting from
+            the last index of the current vocabulary.
+
+            Returns:
+                Number of tokens added to the vocabulary which can be used to correspondingly
+                    increase the size of the associated model embedding matrices.
+        """
+        if not new_tokens:
+            return 0
+
+        to_add_tokens = []
+        for token in new_tokens:
+            if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                to_add_tokens.append(token)
+                logger.info("Adding %s to the vocabulary", token)
+
+        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
+        added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+        self.added_tokens_encoder.update(added_tok_encoder)
+        self.added_tokens_decoder.update(added_tok_decoder)
+
+        return len(to_add_tokens)
+
+
+    def add_special_tokens(self, special_tokens_dict):
+        """ Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
+            to class attributes. If the special tokens are not in the vocabulary, they are added
+            to it and indexed starting from the last index of the current vocabulary.
+
+            Returns:
+                Number of tokens added to the vocabulary which can be used to correspondingly
+                    increase the size of the associated model embedding matrices.
+        """
+        if not special_tokens_dict:
+            return 0
+
+        added_special_tokens = self.add_tokens(special_tokens_dict.values())
+        for key, value in special_tokens_dict.items():
+            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
+            setattr(self, key, value)
+
+        return added_special_tokens
+
+
+    def tokenize(self, text, **kwargs):
+        """ Converts a string in a sequence of tokens (string), using the tokenizer.
+            Split in words for word-based vocabulary or sub-words for sub-word-based
+            vocabularies (BPE/SentencePieces/WordPieces).
+
+            Take care of added tokens.
+        """
+        def split_on_tokens(tok_list, text):
+            if not text:
+                return []
+            if not tok_list:
+                return self._tokenize(text, **kwargs)
+            tok = tok_list[0]
+            split_text = text.split(tok)
+            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
+                        for sub_text in split_text), [])[:-1]
+
+        added_tokens = list(self.added_tokens_encoder.keys())
+        tokenized_text = split_on_tokens(added_tokens, text)
+        return tokenized_text
+
+    def _tokenize(self, text, **kwargs):
+        """ Converts a string in a sequence of tokens (string), using the tokenizer.
+            Split in words for word-based vocabulary or sub-words for sub-word-based
+            vocabularies (BPE/SentencePieces/WordPieces).
+
+            Don't take care of added tokens.
+        """
         raise NotImplementedError
 
     def convert_tokens_to_ids(self, tokens):
+        """ Converts a single token or a sequence of tokens (str/unicode) in a integer id
+            (resp.) a sequence of ids, using the vocabulary.
+        """
+        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
+            return self.convert_token_to_id_with_added_voc(tokens)
+
+        ids = []
+        for token in tokens:
+            ids.append(self.convert_token_to_id_with_added_voc(token))
+        if len(ids) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+        return ids
+
+
+    def convert_token_to_id_with_added_voc(self, token):
+        if token in self.added_tokens_encoder:
+            return self.added_tokens_encoder[token]
+        return self._convert_token_to_id(token)
+
+
+    def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def convert_ids_to_tokens(self, ids):
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """ Converts a single index or a sequence of indices (integers) in a token "
+            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+
+            Args:
+                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+        """
+        if isinstance(ids, int):
+            return self.convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            if index in self.all_special_ids and skip_special_tokens:
+                continue
+            if index in self.added_tokens_decoder:
+                tokens.append(self.added_tokens_decoder[index])
+            else:
+                tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+
+    def _convert_id_to_token(self, index):
         raise NotImplementedError
 
+
     def encode(self, text):
-        raise NotImplementedError
+        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+            same as self.convert_tokens_to_ids(self.tokenize(text)).
+        """
+        return self.convert_tokens_to_ids(self.tokenize(text))
 
-    def decode(self, token_ids, *input, **kwargs):
-        raise NotImplementedError
 
-    def save_vocabulary(self, vocab_path):
-        raise NotImplementedError
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+            with options to remove special tokens and clean up tokenization spaces.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        text = self._convert_ids_to_string(filtered_tokens)
+        if clean_up_tokenization_spaces:
+            text = clean_up_tokenization(text)
+        return text
+
+    def _convert_ids_to_string(self, tokens_ids):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary.
+            roughtly same as ' '.join(self.convert_ids_to_tokens(token_ids)).
+        """
+        return ' '.join(self.convert_ids_to_tokens(tokens_ids))
+
+    @property
+    def special_tokens_map(self):
+        """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
+            values ('<unk>', '<cls>'...)
+        """
+        set_attr = {}
+        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
+            attr_value = getattr(self, "_" + attr)
+            if attr_value:
+                set_attr[attr] = attr_value
+        return set_attr
+
+    @property
+    def all_special_tokens(self):
+        """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
+            (cls_token, unk_token...).
+        """
+        all_toks = []
+        set_attr = self.special_tokens_map
+        for attr_value in set_attr.values():
+            all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
+        all_toks = list(set(all_toks))
+        return all_toks
+
+    @property
+    def all_special_ids(self):
+        """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
+            class attributes (cls_token, unk_token...).
+        """
+        all_toks = self.all_special_tokens
+        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
+        return all_ids
+
 
 
 def clean_up_tokenization(out_string):
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index e37f3888a3..8a11a84f8c 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -34,7 +34,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -46,24 +45,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'xlm-mlm-en-2048': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlm-mlm-en-2048': 512,
 }
 
-INDEX = {
-    "bos_index": 0,
-    "eos_index": 1,
-    "pad_index": 2,
-    "unk_index": 3,
-    "mask_index": 5
-}
-
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -103,7 +90,16 @@ class XLMTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
+                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
+                 mask_token="<special1>", additional_special_tokens=["<special0>",
+                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
+                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
+        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
+                                           sep_token=sep_token, pad_token=pad_token,
+                                           cls_token=cls_token, mask_token=mask_token,
+                                           additional_special_tokens=additional_special_tokens,
+                                           **kwargs)
         try:
             import ftfy
             import spacy
@@ -111,11 +107,9 @@ class XLMTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
@@ -123,35 +117,9 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
@@ -196,7 +164,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:
@@ -211,58 +179,26 @@ class XLMTokenizer(PreTrainedTokenizer):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -277,14 +213,4 @@ class XLMTokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index a30e6db8da..942b532ec6 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -16,17 +16,13 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-import json
 import logging
 import os
-import sys
 from shutil import copyfile
-from io import open
 
 import unicodedata
 import six
 
-from .file_utils import cached_path
 from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
@@ -44,8 +40,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlnet-large-cased': 512,
 }
 
-VOCAB_NAME = 'spiece.model'
-
 SPIECE_UNDERLINE = u'▁'
 
 # Segments (not really needed)
@@ -60,31 +54,26 @@ class XLNetTokenizer(PreTrainedTokenizer):
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
     """
-    # Tokens
-    special_symbols = {
-        "<unk>"  : 0,
-        "<s>"    : 1,
-        "</s>"   : 2,
-        "<cls>"  : 3,
-        "<sep>"  : 4,
-        "<pad>"  : 5,
-        "<mask>" : 6,
-        "<eod>"  : 7,
-        "<eop>"  : 8,
-    }
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, max_len=None,
-                 do_lower_case=False, remove_space=True, keep_accents=False):
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token, additional_special_tokens=
+                                             additional_special_tokens, **kwargs)
         try:
             import sentencepiece as spm
         except ImportError:
             logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                            "pip install sentencepiece")
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
@@ -94,46 +83,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         self.sp_model.Load(vocab_file)
 
     @property
-    def UNK_TOKEN(self):
-        return "<unk>"
-
-    @property
-    def SEP_TOKEN(self):
-        return "<sep>"
-
-    @property
-    def PAD_TOKEN(self):
-        return "<pad>"
-
-    @property
-    def CLS_TOKEN(self):
-        return "<cls>"
-
-    @property
-    def MASK_TOKEN(self):
-        return "<mask>"
-
-    @property
-    def UNK_ID(self):
-        return self.special_symbols["<unk>"]
-
-    @property
-    def SEP_ID(self):
-        return self.special_symbols["<sep>"]
-
-    @property
-    def PAD_ID(self):
-        return self.special_symbols["<pad>"]
-
-    @property
-    def CLS_ID(self):
-        return self.special_symbols["<cls>"]
-
-    @property
-    def MASK_ID(self):
-        return self.special_symbols["<mask>"]
-
-    def __len__(self):
+    def vocab_size(self):
         return len(self.sp_model)
 
     def __getstate__(self):
@@ -169,7 +119,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         return outputs
 
-    def tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, return_unicode=True, sample=False):
         """ Tokenize a string.
             return_unicode is used only for py2
         """
@@ -208,56 +158,30 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         return new_pieces
 
-    def convert_tokens_to_ids(self, tokens, sample=False):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            return self.sp_model.PieceToId(tokens)
-        for token in tokens:
-            ids.append(self.sp_model.PieceToId(token))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this XLNet model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.sp_model.PieceToId(token)
 
-    def convert_ids_to_tokens(self, ids, return_unicode=True):
-        """Converts a sequence of ids in tokens."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.sp_model.IdToPiece(i))
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
 
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in tokens:
-                if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
-                ret_pieces.append(piece)
-            tokens = ret_pieces
-        return tokens
-
-    def encode(self, text, sample=False):
-        return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
-
-    def decode(self, ids, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids)
-        out_string = ''.join(tokens)
-        if clean_up_tokenization_spaces:
-            out_string = out_string.strip().replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids)
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
         """
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
 
         copyfile(self.vocab_file, out_vocab_file)
 

From c079d7ddff7eeb653842f33f1f3fecd8b210e616 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 10:40:59 +0200
Subject: [PATCH 072/139] fix python 2 tests

---
 .../tests/tokenization_bert_test.py           | 23 ++++++++-----------
 .../tests/tokenization_gpt2_test.py           |  5 ++--
 .../tests/tokenization_openai_test.py         |  5 ++--
 .../tests/tokenization_tests_commons.py       | 17 ++++++++++----
 .../tests/tokenization_transfo_xl_test.py     |  5 ++--
 .../tests/tokenization_xlm_test.py            |  5 ++--
 .../tests/tokenization_xlnet_test.py          |  7 +++---
 pytorch_transformers/tokenization_utils.py    |  9 +++++---
 8 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 220bf45346..dbbe9ac5ea 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_control, _is_punctuation,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class TokenizationTest(unittest.TestCase):
 
@@ -33,21 +33,18 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ",", "low", "lowest",
         ]
-        vocab_directory = "/tmp/"
-        vocab_file = os.path.join(vocab_directory, VOCAB_FILES_NAMES['vocab_file'])
-        with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-            vocab_file = vocab_writer.name
+        with TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-        create_and_check_tokenizer_commons(self, BertTokenizer, pretrained_model_name_or_path=vocab_directory)
+            create_and_check_tokenizer_commons(self, BertTokenizer, tmpdirname)
 
-        tokenizer = BertTokenizer(vocab_file)
+            tokenizer = BertTokenizer(vocab_file)
 
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-        os.remove(vocab_file)
+            tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+            self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+            self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
     def test_chinese(self):
         tokenizer = BasicTokenizer()
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 30959ceed1..8ae8896187 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class GPT2TokenizationTest(unittest.TestCase):
 
@@ -34,7 +33,7 @@ class GPT2TokenizationTest(unittest.TestCase):
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
         special_tokens_map = {"unk_token": "<unk>"}
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 22f7d70017..f5c99877d7 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -35,7 +34,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 07f962bcab..4e5fe83706 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -14,18 +14,25 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import os
 import sys
 from io import open
 import tempfile
-
-if sys.version_info[0] == 3:
-    unicode = str
+import shutil
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
+
+    class TemporaryDirectory(object):
+        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+        def __enter__(self):
+            self.name = tempfile.mkdtemp()
+            return self.name
+        def __exit__(self, exc_type, exc_value, traceback):
+            shutil.rmtree(self.name)
 else:
     import pickle
+    TemporaryDirectory = tempfile.TemporaryDirectory
+    unicode = str
 
 
 def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
@@ -33,7 +40,7 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 
     before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
 
-    with tempfile.TemporaryDirectory() as tmpdirname:
+    with TemporaryDirectory() as tmpdirname:
         tokenizer.save_pretrained(tmpdirname)
         tokenizer = tokenizer.from_pretrained(tmpdirname)
 
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index a4ddd357b9..135f48b0ef 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import tempfile
 
 from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from.tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class TransfoXLTokenizationTest(unittest.TestCase):
 
@@ -30,7 +29,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
             "running", ",", "low", "l",
         ]
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
                 vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index b543ed23f8..827ec1606e 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class XLMTokenizationTest(unittest.TestCase):
 
@@ -34,7 +33,7 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 8fc98209ba..e50fe9243d 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -16,11 +16,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import tempfile
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE, VOCAB_FILES_NAMES)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
@@ -30,7 +29,7 @@ class XLNetTokenizationTest(unittest.TestCase):
     def test_full_tokenizer(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             tokenizer.save_pretrained(tmpdirname)
 
             create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index b191dd22e6..60081893c8 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -231,8 +231,7 @@ class PreTrainedTokenizer(object):
 
         # Add supplementary tokens.
         if added_tokens_file is not None:
-            added_tokens = json.load(open(added_tokens_file, encoding="utf-8"))
-            added_tok_encoder = dict((tok, len(tokenizer) + i) for i, tok in enumerate(added_tokens))
+            added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
             added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
@@ -256,7 +255,11 @@ class PreTrainedTokenizer(object):
             f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
 
         with open(added_tokens_file, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(self.added_tokens_decoder, ensure_ascii=False))
+            if self.added_tokens_encoder:
+                out_str = json.dumps(self.added_tokens_decoder, ensure_ascii=False)
+            else:
+                out_str = u"{}"
+            f.write(out_str)
 
         vocab_files = self.save_vocabulary(save_directory)
 

From d5481cbe1bf9f02036cd054f1e09f93266b56d07 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:29:42 +0200
Subject: [PATCH 073/139] adding tests to examples - updating summary module -
 coverage update

---
 .coveragerc                                 |   3 +
 .gitignore                                  |   1 +
 examples/run_glue.py                        | 124 ++++++++++----------
 examples/test_examples.py                   |  25 ++--
 examples/tests_samples/.gitignore           |   5 +
 examples/tests_samples/MRPC/dev.tsv         |   7 ++
 examples/tests_samples/MRPC/train.tsv       |   7 ++
 pytorch_transformers/modeling_bert.py       |   1 -
 pytorch_transformers/modeling_gpt2.py       |   7 +-
 pytorch_transformers/modeling_openai.py     |   9 +-
 pytorch_transformers/modeling_transfo_xl.py |   1 -
 pytorch_transformers/modeling_utils.py      |  25 ++--
 pytorch_transformers/modeling_xlm.py        |  19 ++-
 pytorch_transformers/modeling_xlnet.py      |   7 +-
 pytorch_transformers/tokenization_bert.py   |   2 -
 pytorch_transformers/tokenization_utils.py  |   6 +-
 pytorch_transformers/tokenization_xlm.py    |   6 +-
 17 files changed, 139 insertions(+), 116 deletions(-)
 create mode 100644 examples/tests_samples/.gitignore
 create mode 100644 examples/tests_samples/MRPC/dev.tsv
 create mode 100644 examples/tests_samples/MRPC/train.tsv

diff --git a/.coveragerc b/.coveragerc
index 9b8c40ecf1..e0d5674aa0 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,8 @@
 [run]
 source=pytorch_transformers
+omit =
+    # skip convertion scripts from testing for now
+    */convert_*
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/.gitignore b/.gitignore
index 05129fc402..6bbe32df6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -126,4 +126,5 @@ models
 proc_data
 
 # examples
+runs
 examples/runs
\ No newline at end of file
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 59583ed712..62d655ecc9 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -60,25 +60,14 @@ TOKENIZER_CLASSES = {
     'xlm': XLMTokenizer,
 }
 
-def train(args, train_features, model):
+def train(args, train_dataset, model):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
@@ -109,19 +98,24 @@ def train(args, train_features, model):
 
     # Train!
     logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Num steps = %d", num_train_optimization_steps)
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
 
     global_step = 0
     tr_loss = 0
     model.train()
+    optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
             batch = tuple(t.to(args.device) for t in batch)
-            input_ids, input_mask, segment_ids, label_ids = batch
-
-            ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'labels':         batch[3]}
+            ouputs = model(**inputs)
             loss = ouputs[0]
 
             if args.n_gpu > 1:
@@ -150,30 +144,20 @@ def train(args, train_features, model):
     return global_step, tr_loss / global_step
 
 
-def evalutate(args, eval_task, eval_output_dir, eval_features, model):
+def evalutate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
     if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_features))
+    logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -214,36 +198,47 @@ def evalutate(args, eval_task, eval_output_dir, eval_features, model):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
 
+    return result
 
-def load_and_cache_examples(args, task, tokenizer, eval=False):
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
     output_mode = output_modes[task]
-    label_list = processor.get_labels()
-
-    # Load and cache data
-    processor = processors[task]()
-    examples = processor.get_dev_examples(args.data_dir)
-    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
-        'dev' if eval else 'train',
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-
     if os.path.exists(cached_features_file):
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-            cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
+            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
             cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-            pad_on_left=True, pad_token_segment_id=4)
-        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+            sep_token=tokenizer.sep_token,
+            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
+            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
-    return features
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
 
 
 def main():
@@ -350,10 +345,10 @@ def main():
         torch.distributed.barrier()
 
     args.model_type = args.model_name.lower().split('-')[0]
-    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    args.model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -372,23 +367,30 @@ def main():
 
     # Training
     if args.do_train:
-        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
-        global_step, tr_loss = train(args, train_features, model)
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
+        # Create output directory if needed
+        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
         model.save_pretrained(args.output_dir)
-        tokenizer.save_vocabulary(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
         torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
         # Load a trained model and vocabulary that you have fine-tuned
-        model = args.model_class.from_pretrained(args.output_dir)
-        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
     # Evaluation
@@ -398,9 +400,11 @@ def main():
         eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
         for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-            eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
+            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
 
-            evalutate(args, eval_task, eval_output_dir, eval_features, model)
+            result = evalutate(args, eval_task, eval_output_dir, eval_dataset, model)
+
+        return result
 
 
 if __name__ == "__main__":
diff --git a/examples/test_examples.py b/examples/test_examples.py
index fada43dae2..8284858a12 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import sys
 import unittest
 import argparse
+import logging
 
 try:
     # python 3.4+ can use builtin unittest.mock instead of mock package
@@ -26,7 +27,11 @@ try:
 except ImportError:
     from mock import patch
 
-import run_bert_squad as rbs
+import run_glue
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
 
 def get_setup_file():
     parser = argparse.ArgumentParser()
@@ -36,12 +41,18 @@ def get_setup_file():
 
 class ExamplesTests(unittest.TestCase):
 
-    def test_run_squad(self):
-        testargs = ["prog", "-f", "/home/test/setup.py"]
-        with patch.object(sys, 'argv', testargs):
-            setup = get_setup_file()
-            assert setup == "/home/test/setup.py"
-            # rbs.main()
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
+                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        model_name = "--model_name=xlnet-large-cased"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
 
 
 if __name__ == "__main__":
diff --git a/examples/tests_samples/.gitignore b/examples/tests_samples/.gitignore
new file mode 100644
index 0000000000..1ac7520522
--- /dev/null
+++ b/examples/tests_samples/.gitignore
@@ -0,0 +1,5 @@
+*.*
+cache*
+temp*
+!*.tsv
+!.gitignore
\ No newline at end of file
diff --git a/examples/tests_samples/MRPC/dev.tsv b/examples/tests_samples/MRPC/dev.tsv
new file mode 100644
index 0000000000..5b814856c6
--- /dev/null
+++ b/examples/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+﻿Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/examples/tests_samples/MRPC/train.tsv b/examples/tests_samples/MRPC/train.tsv
new file mode 100644
index 0000000000..5b814856c6
--- /dev/null
+++ b/examples/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+﻿Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 0dd72b2969..ea9502d2ef 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -28,7 +28,6 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 9340ce8489..7fefbefeae 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -30,7 +30,6 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
@@ -122,9 +121,8 @@ class GPT2Config(PretrainedConfig):
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs GPT2Config.
@@ -172,9 +170,8 @@ class GPT2Config(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 4a3ff732f6..c99df42035 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -30,9 +30,8 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                          PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -150,9 +149,8 @@ class OpenAIGPTConfig(PretrainedConfig):
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs OpenAIGPTConfig.
@@ -203,9 +201,8 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 35a1b635f9..0c5d127d62 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -36,7 +36,6 @@ from torch.nn.parameter import Parameter
 
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path
 from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index b9be1a3813..36b506da3b 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -25,7 +25,7 @@ from io import open
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, functional as F
+from torch.nn import CrossEntropyLoss, functional as F
 
 from .file_utils import cached_path
 
@@ -514,10 +514,10 @@ class SequenceSummary(nn.Module):
                 - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
                 - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj: Add a projection after the vector extraction
-            summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
-            summary_activation:
-                'tanh' => add a tanh activation to the output
-                    None => no activation
+            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default 
+            summary_first_dropout: Add a dropout before the projection and activation
+            summary_last_dropout: Add a dropout after the projection and activation
     """
     def __init__(self, config):
         super(SequenceSummary, self).__init__()
@@ -531,8 +531,8 @@ class SequenceSummary(nn.Module):
 
         self.summary = nn.Identity()
         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_num_classes') and config.summary_num_classes > 0:
-                num_classes = config.summary_num_classes
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
@@ -541,7 +541,13 @@ class SequenceSummary(nn.Module):
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
             self.activation = nn.Tanh()
 
-        self.dropout = nn.Dropout(config.summary_dropout)
+        self.first_dropout = nn.Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
     def forward(self, hidden_states, token_ids=None):
         """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
@@ -567,9 +573,10 @@ class SequenceSummary(nn.Module):
         elif self.summary_type == 'attn':
             raise NotImplementedError
 
+        output = self.first_dropout(output)
         output = self.summary(output)
         output = self.activation(output)
-        output = self.dropout(output)
+        output = self.last_dropout(output)
 
         return output
 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index c7ea294dbd..65db9e7159 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -14,18 +14,14 @@
 # limitations under the License.
 """ PyTorch XLM model.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
 import math
-import os
 import sys
 from io import open
 
-import math
 import itertools
 import numpy as np
 
@@ -34,9 +30,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
-from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                          prune_linear_layer, SequenceSummary, SQuADHead)
+from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+                             prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
 
@@ -79,10 +74,11 @@ class XLMConfig(PretrainedConfig):
 
                  finetuning_task=None,
                  num_labels=2,
-                 summary_type='last',
+                 summary_type='first',
                  summary_use_proj=True,
-                 summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -164,7 +160,8 @@ class XLMConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 628dbe7450..e0b3fb0661 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -31,9 +31,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                          SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
+                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 
 logger = logging.getLogger(__name__)
@@ -227,7 +226,7 @@ class XLNetConfig(PretrainedConfig):
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_last_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -314,7 +313,7 @@ class XLNetConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_last_dropout = summary_last_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 3e14673f46..1235d6f3cf 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -113,8 +113,6 @@ class BertTokenizer(PreTrainedTokenizer):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
-        if never_split is None:
-            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 60081893c8..a84b8d6f44 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -142,11 +142,7 @@ class PreTrainedTokenizer(object):
         self.added_tokens_decoder = {}
 
         for key, value in kwargs.items():
-            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
-                raise ValueError(
-                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
-                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
-            else:
+            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 setattr(self, key, value)
 
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 8a11a84f8c..8851455829 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)

From d0efbd3cd1fa268a3f5c5235237ceae0bde69776 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:46:43 +0200
Subject: [PATCH 074/139] update sequencesummary module

---
 .coveragerc                                          | 1 +
 examples/test_examples.py                            | 2 +-
 pytorch_transformers/modeling_gpt2.py                | 6 ++++++
 pytorch_transformers/modeling_openai.py              | 6 ++++++
 pytorch_transformers/tests/modeling_tests_commons.py | 2 +-
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index e0d5674aa0..fa6c165a8a 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -3,6 +3,7 @@ source=pytorch_transformers
 omit =
     # skip convertion scripts from testing for now
     */convert_*
+    */__main__.py
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 8284858a12..56c30efae4 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -48,7 +48,7 @@ class ExamplesTests(unittest.TestCase):
         testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
                     "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
                     "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
-        model_name = "--model_name=xlnet-large-cased"
+        model_name = "--model_name=bert-base-uncased"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_glue.main()
             for value in result.values():
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 7fefbefeae..840016098a 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -119,9 +119,12 @@ class GPT2Config(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+
+        num_labels=1,
         summary_type='token_ids',
         summary_use_proj=True,
         summary_activation=None,
+        summary_proj_to_labels=True,
         summary_first_dropout=0.1,
         **kwargs
     ):
@@ -168,10 +171,13 @@ class GPT2Config(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
             self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index c99df42035..024ff8eb41 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -147,9 +147,12 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+
+        num_labels=1,
         summary_type='token_ids',
         summary_use_proj=True,
         summary_activation=None,
+        summary_proj_to_labels=True,
         summary_first_dropout=0.1,
         **kwargs
     ):
@@ -199,10 +202,13 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
             self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
index db79b017c1..5535177aaa 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_tests_commons.py
@@ -396,7 +396,7 @@ class GPTModelTester(object):
         model = self.double_head_model_class(config)
         model.eval()
         outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                                                    token_type_ids=token_type_ids, position_ids=position_ids)
+                        token_type_ids=token_type_ids, position_ids=position_ids)
         lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
         loss = [lm_loss, mc_loss]
 

From d743f2f34e11bdeefe144356fe89331c0f44fc36 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:58:58 +0200
Subject: [PATCH 075/139] updating test

---
 .../tests/tokenization_tests_commons.py              | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 4e5fe83706..44adbc6b53 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import os
 import sys
 from io import open
 import tempfile
@@ -49,15 +50,18 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
     tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+    tester.assertIsNotNone(tokenizer)
 
     text = u"Munich and Berlin are nice cities"
-    filename = u"/tmp/tokenizer.bin"
-
     subwords = tokenizer.tokenize(text)
 
-    pickle.dump(tokenizer, open(filename, "wb"))
+    with TemporaryDirectory() as tmpdirname:
+
+        filename = os.path.join(tmpdirname, u"tokenizer.bin")
+        pickle.dump(tokenizer, open(filename, "wb"))
+
+        tokenizer_new = pickle.load(open(filename, "rb"))
 
-    tokenizer_new = pickle.load(open(filename, "rb"))
     subwords_loaded = tokenizer_new.tokenize(text)
 
     tester.assertListEqual(subwords, subwords_loaded)

From 269e73b6011f87f4d5e6fea47f8fee11dfcdf2cc Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 10:11:29 -0400
Subject: [PATCH 076/139] Adding example detailing how to add a new file to the
 documentation + adding fonts.

---
 docs/README.md                              |  41 +++++++++++++++++++-
 docs/source/_static/css/Calibre-Light.ttf   | Bin 0 -> 62476 bytes
 docs/source/_static/css/Calibre-Medium.otf  | Bin 0 -> 47860 bytes
 docs/source/_static/css/Calibre-Regular.otf | Bin 0 -> 49856 bytes
 docs/source/_static/css/Calibre-Thin.otf    | Bin 0 -> 46740 bytes
 docs/source/bertology.md                    |   1 +
 docs/source/conf.py                         |   3 +-
 docs/source/index.rst                       |   3 ++
 docs/source/migration.md                    |   1 +
 docs/source/philosophy.md                   |   1 +
 docs/source/torchscript.rst                 |   7 +---
 11 files changed, 48 insertions(+), 9 deletions(-)
 create mode 100644 docs/source/_static/css/Calibre-Light.ttf
 create mode 100644 docs/source/_static/css/Calibre-Medium.otf
 create mode 100644 docs/source/_static/css/Calibre-Regular.otf
 create mode 100644 docs/source/_static/css/Calibre-Thin.otf
 create mode 100644 docs/source/bertology.md
 create mode 100644 docs/source/migration.md
 create mode 100644 docs/source/philosophy.md

diff --git a/docs/README.md b/docs/README.md
index 22f1116c87..b88cd50bbf 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,18 @@
 # Generating the documentation
 
-To generate the documentation, you first have to build it. Building it requires the package `sphinx` that you can 
+To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
+you can install them using:
+
+```bash
+pip install -r requirements.txt
+```
+ 
+## Packages installed
+
+Here's an overview of all the packages installed. If you ran the previous command installing all packages from 
+`requirements.txt`, you do not need to run the following commands.
+
+Building it requires the package `sphinx` that you can 
 install using:
 
 ```bash
@@ -14,10 +26,35 @@ You would also need the custom installed [theme](https://github.com/readthedocs/
 pip install sphinx_rtd_theme
 ```
 
+The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text:
+
+```bash
+pip install recommonmark
+```
+
+## Building the documentation
+
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
 
 ```bash
 make html
 ```
 
-It should build the static app that will be available under `/docs/_build/html`
\ No newline at end of file
+---
+**NOTE**
+
+If you are adding/removing elements from the toc-tree or from any strutural item, it is recommended to clean the build
+directory before rebuilding. Run the following command to clean and build:
+
+```bash
+make clean && make html
+```
+
+---
+
+It should build the static app that will be available under `/docs/_build/html`
+
+## Adding a new element to the tree (toc-tree)
+
+Acceptes files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
+in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
\ No newline at end of file
diff --git a/docs/source/_static/css/Calibre-Light.ttf b/docs/source/_static/css/Calibre-Light.ttf
new file mode 100644
index 0000000000000000000000000000000000000000..2e6631909a671e74db99044a7a1dad512df82207
GIT binary patch
literal 62476
zcmce<2Yg(`)jm9PSK3u?yXtMFm9(lW?XIL%w|aG#EjPKy>b5KyS#rZR#(-l?H<-`^
z#-SI7&=MR&XrYB30wkd&0p0`*0TKwotM55;@7<zi=Y7B5U-sR5@9vZ{XU?2CW#$ay
zjIl6$_%K#m)824V$W?KSeSI^&UD(*TWZvq*r=P|9mv~QTs&8oY`>pmR#_s$h-s_td
zE@-bGiFt#udlMLI+}G5;wDz&?`z6NiGBPH8zo0$ewytjL3C8#Vq+i#)scZ9%kL*NE
zey=j7Y3%RXvUy^HX;9vq_}#C6Xh$Er|1U|5g<iwhfs%pVt{%x6vl97(@O$w9zR=cb
zHsHM;?`Z>@#<uGRJ1)Wdr;KSo9vbQHS{r_D31dMqD1Yy!uI-y8Keh_L4<Nm9xNB2y
z>e%5z#(s?Ywd0#dwv4?oyz@53Zfa&M@Ipqo;OxA(bD!9=HnifOEKq}T8GAMSZ&B)f
z;=n|UrcbjQ?|uv|6@SPl9dsUH&uK2g?*}w}q9o-<y733;8~!+3&UE5y%E?8lYVi*I
z^VfMU^JRY0L1{U@Ym@JM4NKwyC_hl6(MXb30=ASgl#Fe2TjsI7%=jNkfnhg$#K_o{
z^fNy$_2Fh@8fhK9<C}T-_!F*|Sq<))QaX$0udsM&6HAwDEQ9S~nY;mYHsJemmMlrE
ziCx0{c@4|NJ%!ttuN1?wc`Hli$5|Zz4U6I5ut@$A?qA^hdszhkkfq`|jna#}m#{L)
zpSAHzSPj3888lkdRmY;GK(;`d!_3l3+%;?ge+TytykE!`u-EYZHq%Q++`q;5^xFWg
zA^aY$+|%*fKUf^#PL@`)I;jR_e#@ey1QsJjBd?C7a|hA@X58b^hFHFi836CY{1KMH
zKV!}Om&}a2nLmR|Vo{(ZGygN%`~<6FPfUEpPl)Fx>3P;f-}B##^i3j99^NkluCg$K
zK4i807>kfP(f<2^3q13ik#;^au-^iP`KbSD;E-_t8t_2%{uyxlutfeCe*1=9FFlR#
z5?DAd1Dr=#0=_%UD_9=Vv~V4Z<{`i{!G3~8upa~dZ`f1lM?Ei)^GN5heCZNg_p*H8
zIbPtH@I2|#ti*SOci`OfVy{j78?d$F{x979f$vlo&I#`x7vOJ_wz6D-cf!4T5#G^1
z{GY%L;dq)$`UB{L+D|z5xPb4kf%iIG60QJ&YZd2&caMwc5;)HlcqiPe7vcRg{6>9&
zc2oUR?r1yJPdJ$N4tk{irS^+E(I(&&G&<WIv`h3uG(7DcbWJoQ-HLMm%=EzX-_hn%
zz_kzBi+6B@CosBS1D!m<l5lUty-7NVe0y0HjR&GhF(yElM4L1=D2;M2_Iw9jHUZ`&
z_6bX(eiU`jbVuJ3eyNW=@0w<2*2Lm_8f&0;<zA@VD>=rW7y~ohX>8K?@Ve)qZ$K~9
zXJ>q;agDJ><688sI=(T+#5ni3^Su)<s(mhK9rQ-DD(FRlQ;h9z*a3_ox~{_YHm*By
zy@6{Ft{(|rLUpQjt9Jzk{s)$+nUB7@gvH`AfOi_WAI7eqxEsLFlQA}pd@jbp%Pd86
z5lcs3#fs~ti33tROQbO`RRKR&W8A*TtkPAu{|eXRcs`d|h|Y<33jTQ+D+8|U_><sM
zxKK_N;CvI;tGM3gC-J=u|5La>q+Dtncj5got}Ag-Td&7O?cGmv0r3kJHo<?02NB=I
zMf^+f8)^sU0kz$jBjkC4cr5W(@H*mig1`H*NX)(ExLR=);7Y+|!<B}s1y><1`aTtx
z1y_-Br!<+|hjrp=z_nVrGkmYt7?}b0R{R?lpF7!3)`n}Rcms1~G(*yrfYF3Ma8QkS
zO28az#259f8g24HZ{5m%%I;<l@@Bq=AL0{ItmKe>BmF7qf~2pF24kEt$(U-)Fcuig
zjP=HQQle8*Q<kKxNsUNtPF<JU`yUCi3&Y%cGfKIg-3Pwhic-FpVo=Jn(%VUUlfE+Q
zjWNaqqfwMn>MEr@WtCD&&wu#D#KfN`o}Bpg#1j)ICVn+>eB#cD+KKFmVCOl`k&k}+
zQP+pHAI|x(>ch$p%RaPznDwFYgI|2G;Dd$_B0mUyzw>?b`(f_~Ye(gl0-FEZpH_mT
zL4wJY^0-UZA<1R7KY?>#+y^+;f|vO*e-^+3Sr7|mAuN=Iv2YdvK4xH%px|g0gYggt
z+Dl-G;8e-LX9`PYCYFY=lEKUvU0E!f<*;0q$1E(L6)-EaF*_?{MXZ=PSP3g-WvrZ4
zuu9C>bI`XntQK=fJ!=5>Y+}u<1(eao=CXNgKInKMTf`Q#cD95qWgTo8B(fE3C1k`-
zwwkSBYuP&1h31{h_ObIZGA@N2buGJz-N=5zZb45UVMp0*qQCE8cd<J`J@=sJkFopN
z&)Cn|1MCs@Fng5!g0BX3_OM>o&)2do>;^W(26;alWdnROJC7Yk4I}(4KFa#MP`QC^
z!~3o5A-0Qk(=WW7kKneEZRhLQdbS&K-XR{sL-|tP!B_HCd>LQP9z=@cypZ?sUcR1p
z^DTUguVG_+1@Git>>TzZQ2GIO0XxVpVi&Uu+2!mC<hTqY`WkjMJINb(8{5R2FbZ1O
zIG+nDZp82IzXafY9dPh5&*sH^J|Ex*_znC){se!Me<b-xQBt$CPr5;RTKY~Cq-oXc
z)f~}0t9e`Vna1f;<#V;qF`qyBO1?(l65qMLW4;&rUg!I;@9VxF`Tn3aXiK!qwFk7v
zw4Z9f*G1_vb&a|n-A>&h-BH~Gx~Fw-=|0kZ>6h)-;kV204!^hjzVo;FFYzDpzuf;0
z|A+nm77!4S9k4iHSHLv^F9m!S7!qg<tO{HnI23q(;C+F|13wP(!9Qcr!k}G2HwXPI
zI4XEa@WsJLgZ~)(X-GgwM##dDgCW<1JP`71$m=103H1q02yG2L5c<ckgs_UR&ai!9
z4}^Uj9v5B}z9IZT_-*0Og?|?j6k&`giC7bHe#8S2f7EOBh5BLrUHYdD%#dbiHf%N=
zH9TSXC^8|kGxCPWkD>~r4o1BjZHZnPeQETQ(Lco0$1IN-ia9^#o|ungjj@ffL$TMy
zz8V)0R~)xF?wYtK;y#Pdj<1g&j=w4XM7%R0GhsC0=7i4@V-gz^w<q3}_+C<2Qf<;$
z(oIRPCw-Tko7|ebC;7JIkBnMlm2rddF5^on+LW@Cy(y2TN~y)EyHX!ENu~_bV$*ih
z5!17#lW9^~RGKYqaoT9w4QUUiJ(>2G^tkle^uF|~(odwnmHt@<%Sg?r%~+W+oN*xI
zNXGGuH#5F6Czu<}+s)URpELh0GbXb-voG_S%;z${%Sy|dn{`drvss^Jr)DqA-j{t>
z_NO_iIXyW?az4n7%UzRuQ|^<w-{eK)mE<kX+n#q<-d`+Qi^a0ia<Sz}%lG+d`HS=S
z<v*PNUV*kCqoBTEtl&_=v4S@X{$-7^mRSd^x7j3HuI+&B1$%&fiTyhJgZ5|bZ`)56
zvcjaovclzsLxq<YK3Mp6;SWW&q76m+i*7Eur|8k5SBpL^X2sdXON#dtA1c1D_{rig
z9ZMWNjxopij%OXt5@X45$?K&VrANxNWgTTbWmlCwSoTuc$K`S5wdE_zN6RlRzp4Dq
z@{h_-mj6%@P?1n!shC@_ykfXwZ^g|OPgJ~F@nvN}Wp(Ak%7Mzgm6ulDP<dD7iOSb2
z->dwr^82cYs@$ros)bcMtBzLvFsEwHt~vM2d3(+e)!Eg})t%LsRzFw$QB6=yZB0kb
zH8ro+d{pyYZCq^?{_U-OulB1tRu@;-ShuonsBTZ)b#)Kay;S#EeFFGx4J5;hp}+Zq
zbF$*XV!JKMpf#nMHFix+R9I+KRA`tKP!gawMn@Z?p5nJTp(sO9#N2s=J%oH3a9^e`
zvPar+W#-xQ_>rgaC(^_tVF1#^gX6Mvok?eMh|6Jjm~h#3;==c4Jn;{8gL!#*H&}A=
zy8PFt_4^H2axFLFT93D%rr((MN#^7D198Z=Yp#L%&RP7#-TZFp0q}1YU1Z^Adz9T)
zTvV9PExf3(*a4uDcr<7<RU2uDO6F0K2CXjg&4%?6h3VU>7p>n|-LS4TuV!he)ihSt
zR#dX)N9$UbcKfzeuB#}TAD<LaTUOt$S+UBmwXD0$o}C*Om$A8iiLYRrzASy>Td7GJ
z0S<yO#~3ktFgv$%jUkFa$Vk;PLS+Wvu$c|o)C}CC^o0s60COe3G{-sWur=H8pZ~kH
zu&A|l!yj5V;Is8VHdC6-mS(a^eW$MDUzluox7(f9ZfI*ODs26}u(btG&UWQ{xlb|x
zPX^#I5hUDX;iA<^Jee0e>{?y4$*h3mAehk8x)2Vm*liAeht;>dz*-d(mGCRGCST(i
zsBUN*S+c3NsAy$FbNeT`rHkjRtK{$6O3Umq1#wBw*RI*LWVmJVhDD{TBj;4qx3|9^
zGC#j+Q3C+sf;Kt?Z3xQ9v=B|;`=p6`csBHmKu}s(u_H2s8=|5kGm@@2d-a|@tIs~f
zBc3(dqVN9K-O)Cq+_(2|vz(rVX>~<ayfDLD6vfR-27c6e(lZC;N#;9wJaqeTP^H-s
zjsAB;>!Nk$NK=u{yn9Xf>Xg-CYu1F}7B2DPgyNwgeBl4ajm0o8am?kL_+$J7f>95m
zjx-^osmT8KoS*(Q=jc)X!BKqXAf5lj^_Xv4C|$IQPD3d7FHVXK3y%yf=<U%ag+)cB
z#fk9(oUqS%ERP4RFe1v#e|&DSP<<&Fe?yQR{5?ls3PxQNMxQ+D1WtW{yFB0m`J?rA
zjqi5_-z|~eJC#afF?-@${yF;HpIYsknh{2Q8>W>~wM2=9Jg9HiuD;$~y9`IxtU0pA
z|9&2K|NYL7@0VH*e|h*Y8QlnP`20%hhcw0X;ihn1k>e}dEe}flr>>Q<+lye`LK@iY
z_z~1&#9ffF&Q!%wRiq(A7s^v2Q;Hl_5?|j`TGOht`wcExyv`@U&)}cnm*VpXuW@dF
zCnq(PKU-3}pnloL)pLVm<MoLl#(;F6XYx&-<!9E4I%3e4TTw@*;3{Gqfr?!fX>~~;
zSt4VzJqdNwI5RzLDQRgbceDlCf(F{!yQ>R|8~c|p9SXDuE|^n2Z&5i<&#QJ6)zy~F
z?d)tWUg+<?pki+CnuV2hP32|vXjdfK^A6}W1a}SbF}d4QiuAnvvyZJ?wpg7X{6p$<
zuDRwK%hgwB@;i`+;JgcYVgYBW&J>Q;M-tir9KnV`Ra}_yaZ6q<Z*+cA(A++}`HrCw
zd%`($+S~c^GTxm3zGchO_OZ_PbC>U1q4eR4q7O}sMbMx`ep9MLVO`M*qS11*Ja{*D
z)YdFpR#WS=r*^k>OTk^0OU@eY*j<vgd)*b>|KO@s2Ul(Dzj8-XNO{}VrOURBw~bwW
zAK;Ea8-EPA(a&6C(x()q@E`xV@FV9jet}eZ>M6;Qk8S{!rJ{Y010Ee@Y4k-gfdhIH
zcXWtBFZh~6a9O$8a?jjzVCBjKE4s(pTFc8?&tEa#SeozXpTD5L#8TQgZkj(jf8N;V
zwd-qZ*E^T@%qv|M8s3^;zj9@LertH>veJ1ys9R!bsQVce7fetZAw;L}c5?_An~a2L
z!nY}vpFbNiR?mP8QI{Bh{7Q@+z2^v&Tr;6M7HKAQk4e5(Oo)UlN#+G8BakS`lN-28
zaq)#9r5HDV;J=oO;?OX*K{9?_jKQMxNL_Sf8ISn%Qy!dub;0i4R#8_Qd!0YdccF(c
z?>dT7BGYlT@iFJceC+k??7i991QUmTvW+j1%i@u~MGkWjU*h~6g;nHVZQZ@QfZ#Sv
zoZ=S=Jm+|}U2h1{>co(9RQb^Oa*e-VC=$VOtu{gzuT6oho-f73`=*7(1T>b-Y10+@
zZD?P-PG(#D#oz=(Vz5!0#`7{$@{_e022-lOq<VhcvO(e}66Vi7=|k9T3ZYXmuG1>R
znWpL_3A0ipy4E7eQ<XC4-CZ9Q?a(<ul41gr(((ezsuurb&!I<4)@`XCTeE(1>v<ci
zUORfl!<!eFY`zf{$yN1z3sUVt{y8zR+vi!cD)_F6ilMb5mp*aXyzLF9ysVteh5I{h
zdiVBoww#la7@NAddg<QOXzQ03)eY2IO9`hE=6ELMV(jRkYcg)vo6IRX6F3GB$p6~D
z(D~&8JZ=4qChQkpkkXuRH^8QclpiBcFz5^N5Etw#hlh(X<7>AS;yF?t?0iE>{f0HK
z9ADGteBj`rLkDkeYHMraRRQfKi<V1$i|1_&_}F>!ySmXj6vNDz>toP&fC+OcN{g1~
zd%-yzcEXeSy0PWk^)|z*+O8ct*VV2v*z`M=jUDAl_pV)3QGe*0L-iGl)>69wqf{u`
z6$+XXz~pcZff%MW?K^PR1s9xk!2a0&qeu7WU;5dlr=FMkZg+lqJK=<j_PvC>G+z=A
zRQ@gCyPXg5T<4p-&3T^Gm;dMde-UGm)BhalG2hu`u8AD`?7YYM3QuvqkEDMS^CGpc
z0PVw^laBdU?tZ1AM3JJ~i}}ROukGKw^~M{=SL`&{a}TWFwX1to-`9C*TQ9p;v~%yF
zoo62!lIxYqWH`AU6I==|dw7bZgHUz0-45<Ya#p(Y58(4B<@9VH(qq1*`EZ||e#bkt
zK@a|P7|#M*@+^e1#H^$*(zTXX&$n+}+CF$!Mbo?m{8s1c?m^P_Mcbti83uhgxBGJw
z*F~aiUR~?_l&4lZ|J{IUoF_P5oe%Jw#=la9GR|JXqc3nqKu3bm<^i7P38hN82k|@n
zS1Be9H7s7Vk?%JC$y)NsCq-6?pPE31ze@3F1XuIKA`kk3hnw&}+|I+L*i`3*Rx2M6
zjg36@cR&OBAR3|mXcW8xd=k<aI571_go`^%jn{75dhNAaxApE^-?ejR*LwcA^V4I;
zc+4^9ypx9ypCogHOl$lL)EgjXA`{gsCQVE)VpQ<FyY26->hCXXYrON$ChqV2R_a^1
zWa-M$opmsT5gZ=~I6`2_VZK736;u&v&_!!pz&IfK74z@xRf_~j*RHYo$xv0>3QhKA
zV@);zcI@KoSJiOGHiFJr7hl^f!w3HtjaPW%xEns5-o$B~5<ldTSK8lM)wj6y7v;`J
zjyATnHS$m=1cVjstsVZ(FQk$mYAY&hsJ)PxKr7DyZ|O9$6p2MqPQ;*9XgOJC48Uq!
zQAB5T{n8ENjZK>uFB{g|GtQ}McU1MaHIA<6IQwcxS#9Z(q!fKa)!dcsO+#~vEtc@4
z$a$3uI+uw0Aje61z%R726i}BbJehy<p&b)@K1QyCV5&o3#vu*p*9Q|R(JpB$bmv#U
zdY--awm!T6xjQkbTsg3D;{e~}ym%+&BmS3-e2^VLXGCP&Vfm`)&9}fjPJJL5DGjg&
zo-SySQwohKkRz?0_h(;zsqJ>l6_@ATVZRjVzm&p2i-D*9NzXE_;7^r&kr%v#YxI;+
zPqpw<ufJP%;#Vc_yixY>BcRCFozD@zxr66Bzvlpk>IGGbync9re2iv;F9-kH@qzvQ
z5?;iu&YLMM3|vTyzTn_AMEmIwS~-NENY3APHu7!GOZm^72lMmIQdquOg=+z1?*z!+
zgwt@yhPd|J^xNM`t55C9lite{^?S(Y@n~=n<a6oZsZ{Ab)E7T-5BpTk9~};6hU*u|
zZ<fx_JC#b=ktUY^jAsHSG)HfTrlvDplrvh?zPL#0d+6MA<urzgJNVp*w}>Z217rg)
zSq>$Fvxg-=UM1@W++?=%D(f<sx%VNH#4;xS&Bx(aiiedGy;kHofr`0Z6|Hs9o=LES
zpjFJr3&QHnjce95rqzd+7q-V*gO`@KE-eVG%d|H&*)!|1^+|126>}3K7T7B1mQ=Sl
z{-!iB-MZ2S%%aX_A-^32ePPCP5Wd2N_$oN01`4@6-=%7ISgnS(oHac?YjWBQQB7I(
z3m4XBHPzO_$NXh-?gdX@kjoRE%(nC&?N{dnei&_`IZw!E)6988%Nte%*#oNbYZfi4
z$*&5q2d!vWzU}3`ZF6$1eRuR(bLX@P7{Sl@QQ%C#m?EP^E)&q2J#n$c4WiZR22zGE
znTMU)y4wY#;FJ6(fYCs0PZ#sBqHHJf6wz`G#Hkbu=K}@7tLm3;+PI=)jq?|LaZP^R
zmWp|PTrbt0JEv`LfA79+>m!zQ+j#xr#%l7_i?PLnr7MNL5=o3BkrOrIUy)r%Nj%sP
zo0c~)U~O)0bi_*sErIsDwjAdOzanR0h1%|U{1(6$NA!^H*>2FK=pb2nse}Q=J<X07
zT5~$PyE}7Q4YuBnxod)KQNy)OQ4Lx3^XJ!RH6-RIXYV_)FDJRi+0(bFsk(E?FCWXw
z>t4918`MoYHRHDc*BGNTe+gQM1lWRI>%yCCQEhpx3$3N4xy|9|`oE`ImNh%y<;lgh
z#o5m1(R9!hBt*=|y`X)PG?2Miv8e2@l^YTah6H}xS<jCjrbh#lCcc`OC(;KKw)CQ9
zFdYyBcjqiG3iZ<krx;xY_>`1uay2PA{9C0Q8TNYtJ8Dfa(`;=Ihv6!kZ~k?$&Gz@&
zH{N~Eb`N0geCzR3qL0v4wnH9^1mzAJxWHDtD|nFh1NSL-nT!M%Hi)~>k9^$Fnp@Lo
zEy&NT4zbCfwuri%)qHcBbx8vxtMsDE!Yt>ruKV%&)Eu=Q(Z@2rMQKqp;hP_CnuKnd
zhK~VG9oZ9L!xEM&_e{@^Z8|nIbgX<~UG2h!wRODV<-L1f-s^nkj{g2T`l+v)COY^t
z$RA4Du^9g{49fVYUZAeuVY6Xqt?unzjnQve*06A4!?Ie4&$X_hzZ@eU<(X05Pu22#
zC(*2?h(-djElwj<Db8k73cTwxWuVqVyfZ&3N>phwMwH387)v_zmV)^heN<*un9UYe
zovG)KTm1@ioi6}SraaJc68Odd@}<&zF6ariMrDLgvNK{bPqM{#6;v#4AMC)C9%PMM
znOl~gvtVR-L;cz<i?TCIqN5X*l+;z1&d*9uN(zcf>@2UV1<FxR80zttEZ`%|A;!0`
zN`xB}wJ2j%TN?}$)yBkx3|m=0&(0Yg&2j#rIXpj<loifuk^efZodFF%xKr3Zh*#DR
z!ufU9s8(grHHFjI5oEwKoFCGt<3dg#Tft2z19OsHp6^fQ{H*BEAv@Av$hV1lkVa}n
z8pxj9J>6r{uG_GwV1LfWjXC?Rn>HZBAwG-&I^w)SekK?(-}4_MFUA%p>m=;MW_z@Q
zUw-3J$At%rj^60lcOF0f_Fqn%_~_k-9+L4A0a_rKbeIE84k3ixWoy2Vpt|cAf7p3|
zZx?WxQRX(J^Pc0k!Ojut{1<O_ewCAB;=?&6=OMXV1Lk-;#wpf?IU)_^JQBTE$)TMU
z*=@KNi*FLSK^JvR_F0=V?b-OVXKp?#J2UflujJ(wX1$ze$$L4gFfZ>FOAM42Y9HhV
zwnyd#(Lnuze8pl%zd<J8WlDWTg$`F;QK&<$ODk%BI5X3`M(r!PaWNLJO6B@8+3)!<
zpDQG5yBKW5!A)XX!|<rd%g;7-M8?GB6=bJ&L|J)DQDt*_a$#i?##qh7&F};G17Gk1
zM2IX@f<|xGtA2o>fV2SqhA}G27=7w5vL}G_(0iZ@r3t*b?d@(+f>cqkO)|}|UzAtc
z;ypLFl;$m3|LOWY+Ky$q*3vv1W@uYpS*xyNscyI2Cdjk=H;|vvFM45@Gw7f&7TJp{
zxjO4X^(7W?=bd-PaJzF=P3+2*Yv<Q34zlW3m)7&!d$JCd8}srqiVkJ<bPhFFE?-3W
zhxWiffxHEI72XR$M;bEw$)-OldhX?qm)v(>@#B}@c;k(>8*k*=EbefAz>Uu5v*u-8
zetFhB04Sj!uj4lZJ`23VEL}E7DfT~zSF)t9+J`;(gD?Y_qoGb%cs?g>mah(PpHtrv
zXpQO2aYQE{vgFlA;0tgK*B9x3zqc?dD)v5mYDYpwNo{S};_URqq^O?c;>_l>1aoO^
zUD@KS^u*+-?qo-1TG>V0Ba0$ZBf_^CisK-dNm%<8$5%+J;4R0fMIX!7N{9N8alJkv
zA|hc!M1md<QeHxYJ~%|560E129y2fLfHw&pFr1_ZFD;N*uq{oNphR~Nqmk_p(oK(^
zf8L{yo_GE=KRx-=pPqES*E6?#0h%?hq^YT7UW_$pLHS(%nDfgAALL;VI$OUva^#yM
z&pa?(UEH$t;L?`jYJul)tg!gEG>jP~13jiPG=LsGofIou2mYNCV2QEJsa}(uX0(Oo
z_?73_%W`yCG0EB4$uU{HCN@xS%g(YVT9VT9GV)3a&PffAN{ow7iUO`;SqD$yq;rEi
zIZS4n^u_0G*3M3A=kLX5#{u~d>ktw+=lg*h9cH}*tSfNMn^D4U8f7z+Cqd}CC@6!p
zU8s|M|AwnKY`A*<ioGjV?5$5qnV*zyDKhhjP>Us0(%vvQc*EfL`<E}@zv4ja6UO-b
zfVn$j!pl%LXNRR&zDBAgU4XoAho6^Xv$6np9zV!KMIV9lyY0x)G>elXDr#v?QgTjC
za?)kBYijXdim%Gasmg83udB;1sH+pWZovwI$FR0C5N+}Df@q+1e4c&uDE=Siqx_3K
z*?ab6JHO2#u>&wg^F;O;)^?FiT$b(NA0Q&5#RM2w<Q_O-j7vzB@_frW9ns;j9oPMC
zvq2vw1;-|(CFPeF<`{Ae@tqloJENj7@Tts59>cyynV>hcQ`Wy-_DEQTJp2LjY@A^l
zj)^m+#>b}m=4i`u@=KDV<5S9P1(m*e0Tqd+^aTE(DJd>GIXN!7*q#-at<_~EWS11>
zRTvZEjB=eG`jj4y%1^BAiaG`SMoiqsN5D78=f`}()lK07vE@9R*XLX}e|&s??n>#2
zQ`P+5#`?O(Q`Klg278~Kg>_n(i`*X848;)qPt}mj*;5mXxMkv1>Ql^Z_J3H)3+_J$
z`Srl-Rg$M~=Bk{IUa8;tFptj7vjJ!cYctDv62^z{0lEE-D)sTC){4s3)}0B~g;wiA
zDemCXr498<U+SAPr*DqPKZ)LHB9^D)XQrcf4TI@PS`3&pu;ie>$(EH4T)=O{b#505
zx?GBPuFd1OIG@Vx=;Ir<4w%cz%-8IM(tn-Q$CVa1|NL%STY6ht%6ouCF1tf18zafH
z?3u{QtCmB_aS0~SSb3)-CM>Sw`nS$9M1)PM@xr*I^rV8a!fZocc;YH^(hfm!a@}W3
z8v#W&*8GUNeb0dI0GIEib~iZJTF~yZPD719+B%R~TAHb%<mVvHm}FyIPLVw;K1-|1
zOayUSDpQJ+2R(2^^Uja?Au(TL@I{Epo|vzuuuAKIw-=Lch>y$#WhR{Q`I*`7`PYIT
zokg=6?=`{}l&c8=zY7q4Gq*KdLna6>i})J*G6$hwwu<vx#(`b3CC!xIF)4qiZE?x;
z`6ERA&{90C@ful@^af#FCo#BAcPUcbwk1nQgmuB<Bg!Htut!@|G0H7Pd9^Iwl$H`-
z8=jVymXc5tZt^JJzo4YL5*O`x5N(w{6XolL<l(LXE+Tow1vrD4kKxVos^U$|#h8G_
ztQ->Tu7N*8)3o>>5)3Uttw)n3>WRQC<k=EQQRAX%Is#$9$m3N_dVYO=etl?SW+MK(
zYmuaSi={q)QDkCbq#+SP6Y(0fN0SO37%C)9m(3qy8J2gDKjbjK#Q7voc7DjKoR@#H
zr<h-lu9uwvcqnx~E|+7TSfuzcU@Mu)i$R=ldQZj@@W)BsjN8s3_!gk$FM&2SG=Gsr
zj6_EDAI}3dUgYJTxOV&w#X&J8ufy)^${Cr$GeC*X*QU>`=8eF-3Hi#?8={v<(#t9G
zj!KcDYE6V7N7-NW5@>~FG|0R{N~6*3CJ)cX{M?H;zVQO=SDy4yFE>!a+Iz;%6S#o}
zMHULkHuin?LS=|befDwD$F2B$(>t95>GG7JsHVdje#YL_L;%<DiHY*$w(hg%s|_Sh
ze~&!cG&BdY-=KXOXpAI#0|K!w#=DLC?2TS6BZ8FrmU*@a{R|ykssSAa5=BBfkUds#
zBSZ@e14&p=<I05R>M^_Q$J)1#oAY=CIk33-JGis{3=@@1Gg7R^2Nn;|BA+tsVYgL$
zB>YnEF@K5TGSP5l#IFIU9<CF#RQOH=E@_>Rphz#-S+!p*LMQO%Np2MB+HvR4CVNt*
zDbrz(n8qu^ThU=h&!k8foDt|FJ^9_HgHd6wQ=C0><<6e$^O63p_PdM&8NXzIpMv3O
zp})c#*N;qUhzsY=|4haZ@k3FkGL{k`g-th_$i+F?mGyfs+IxRerIOdc`umiM$!2&7
z*2g|Znggvtk{2N9%)j25w0*nspQr6i+$sL=*a5DpVYw4uNNteY$xCcvY>r}6qRg^V
zY1Zit(VP@r-$FAdS=UzGc<t7~=0am>{k7YAn=3+Wyu5r_*G~ROf%BQNrR#U<7A(Ku
z;$scfF^$XiU3{#zmDg`-=KISJZ`;u1JiCM(a+2%|60}K{BJhHU$sVB=Q02{vR|v%T
z!~ab23{6vx!!AuOrxujxbB5(afOCU8O!iKxQ@iS<`khR5lYLdvb?&)c_H_|I4JW(n
z6n=^&^4=b>SKXa&h$~j%_3HCe^CqJ}B0HPp3q1M>OPK0`qIHg{7wS#4-_sLCj8>jA
z$R8R+mcDf8h%iD<^@!T<CYNYy6I`MfUo7?Qc6&t8X|T8coBtMm0j$$e-H3{<o77CV
z^eSw?5E6-?cyZN=f%f9ejN+QMwexav3hOHiD{HNtOInwe@KOJD^HvT<7sM@WZdsgL
zXT!?AI%gh#W9i0a)rC!<8;$H&g8hp8N{~R$)Uzan!_#?}D&fDH(Zj@_fMe_bP(M24
zboDnu;XAYX&xdU^6}BIy%<vwD3G(hvXPOWO*nf6f>gz3?_ngMfCY^s?SJ!!`cFhP3
zy26EvG0_*W4xDjcOyR+&cl=2vQ%ZbISQ>5#)nVcK86lIh3QB9LaAA@rUM&2O(kjd?
z2JEh=5F=5h#8eodl&6d{#ou0iNU$EJDq)#Qh}eEQJWjG@oqA(NxZSWG7WmA^?vFFZ
zs48T;wC>YUT9Aj-JtIg`5{c&dec1^K*%0c*+C1rD_#phTM+G{n!-jRvLWYq2jp{1j
za;$*He4#jaqY25Ae2~X&jUn6YjTQXMg5Z@6n~O^})OM_E+EHG(B(?w^-;%k(j=p8&
zom^B?P%~OFUkC4Gjj1eeLH_dmHFMeq`kKotTQY?UxX98Cf92BkZhs~DZ5Ap1T9WS-
z+n~2;*pw>d`)R$lC6MpEJ+~y@qy3oI`A@)m$j6#IyO-5W%IM6V))a7>X}ql?U@;!`
zXyB<pJsiwjKbuFFgKvb*=+%Ws{?xbAcy`tH5HE=*nHKBS$rUndRT`OrTrqQPGV3@s
zO@(4S$~cs5!DixM)7XP&gWxhulP+6v=geSC8hOqo(*h)6PcCC;3tx$cUb4ehhz=s}
z6hA(f`ULU{K3{aDLywT(ZVZTDj2(UZUZ#{HorKR565*8e5*ecq;={K#=h|$!&G|O)
z)-`L+drr<Ev=^+SApWEm+JvHwr)J39iL__R9w4jS8x_XmW0T4i`?z38di`0VK*fg&
zIOAxw38X+QT(@J%qAbk`i=o{XZO<^vZdGBaIT*a&QaC3seNj-&et1|nb~G1A#>RcB
zoAsZm#+)>t<n;KIsKXBpoV&NL|D3Xx7U%FCeSJ&X`w2Ignc-(2LkrBp(>}%XtM}&k
zVmX4B_x(*bdyg`wgoNv-xQC^)WWH`p0i`Nq<QUpXzT!xHGsRmhW_)jtaVB-Dm)Ds8
zb)QFB<l{$KG~Qs@n&f?E6X@)fULI(4yz?<HC$v1Kj*0#c5q?IQ0;U$nxjd5o;9a5^
zJ<eZy73$I6G0|QRE5F-2J_*Mps84d3N4b$K=1^}`Nb@}=PNBo_W1vg22gH$IVVZG@
z9XpedbcyHayv@65lSbsHQ`^{teT0wkzoJJ%Ne|S+>Tg!(XedkJ>T_yF6CZm&Dl{Z2
zDui|lg+xUrMe*>QocqK|>Bvaudtw7oBoPKm%boZee+YY&6&*)iDXoqQtXS1K4AFW;
z$I%e#a;?^(<<0i&M2mIP@}|OEL(-e(lG;Vwob+aTY26}TRoT<OH?KIdd`?&Y-rUk}
z=301IdjFDUtMeb}VoV4>IOtQ1>ghb;VutYYhObq4&LodGF(66ys|)@kG<k;|Tw$tr
zT>~1e_VlpJylGO2?(&FF0n-ZSpSDO}nNEN0RjMFU@UY1RYZN$Tc?)_0sAj6PHC;WI
zdrMZAcvUxvbinH-SB!op9|?T)f+s)&#{4e4t7eR5)l>ff(4%_m6}~Rkj3AGD&4|~U
zF`@+$W~N^=qAB!PFQzQ<0KLg@tP*1tf|Ck~?4^Vb<SBWopgI5OjQ|zQvnF{%iJn0L
zSP%Z3u(yVw_uN)p4P0{G_SJW>YT9K1zl~Put2#_AX8r$8hoZtyCs{uJ{EYG>e+`~O
z7yO!F0Y9{D5BN!Ww~Ixgc7u02nmS7}O(iC?M{91bEJ@GKPA?H<%5wgwXp{ZFkeod$
zJYC!JJ4-TCOHwmuZs7xSO3k^sW(dbbkI@sY5*!HVFB)TZJ?(l^*U`Wy>hN60pRA9?
z8ky+0<jABTuO<D(IfjIgw9w?lD63sv-jA{~CR+Kqki`SBPaNy%+|q5d$`)j~*HaXa
z4EKhBH@Fs7>ixp&Gh7QVWih;UsSBuHWtnC3l%_l1EXvc%0K(8E<PRwZ_K0Zo#EiCe
z@@`IVTfc<NhTb~`q$A^UxmfQ(`_RZviu_D&-;~`4oTOprE7MB$Bnp2%?V3!$908bN
z`JTxipvZF5gMFE75Sj$JCSJ7%$#U6YAs3(yh2P5CbfzvQ<vudc{n_-5IA18|U(e7E
z;qQUgph`q+hFuzbl~}&${9xMM8q(WhLF1`Mz57C@Ym(s)V@+f<X(u!3SE|Tkp0*LY
zMdjXFmb6k{L@CA&*-&FNfta5NU$i{SjxAv{nM{K(lZ<}n+tat@s%`D<+niyOnxgi}
z1xg({sBF=W8Ods0XD;=M^PkhUN4j9&zO`#leKo05gkMD}6zvEh%c9Z}jhI&4n_bw1
z+=4qjH@!##?o-!K+WR8g;4{Qn(5w2P#&dNdRtI>lB$l*UDVaZ?b~Q2o+s&u8D(fC6
zEh)yBgQWfco1cSd<{iKsO;%>u1~HewNutUy3a<+H;A5O^u@)6gw@zE~OJBV@{nU3V
z+?P7X)rHWCFAQ^|@WH`16Y1$kyYG4D+g`q^o3gV@)ICC~K1^#{gx*cLXH!Kld_HPs
z{S&KVoCDMAoQj_?SoR+=j~U1lR$i^lc+TH@fqacJ>y1o?dsNsZpn=V@`rB)Mo!LAQ
zqD*wtm?*kjGn|TR@-M>*t@0aBHz6++!`$0(DRY+nD)_**Y!m1e@Pj&LgJ01&W-+bA
zi7Kao{85Fw(E7+$PJ727n|XLkP2Q7Zu_ML9Z)!?%&CdzWmldC~-Qu2_<+-m_tijh}
z4F`7TiskkiEi@N-E1_xxqdoC(g}WtzzbotSedCh6*5wBj1X<_Bg+@h3#ij6TX*s@g
zLy&uger|E$;=)wO1}Ta1S{&K`OgxkPz=CIr?a`_W+9@-cKX}oTU@{h^H`Kl$o+#Ud
zWUe^dFyL!t&ghtVFv!gDhiQj{A~O_V{?&>V>WUQ#i`Hh^e79I`$2-&Q!IK@@r#_sr
z6;IF^_8^EkIE5@vs=dpTb<VaKAxxf?FPeQ%f?_+rcDAhvt~L`7fY{{817M1tvEdsP
zme4s}%O#aFIFqI{p8A#AJ_<?HjQB!lyaU6FVVq_+hE?Gjv+T;?H@et|iUT39kWEJ!
z+OxHPtr*)6PTP8!tvGcvw!Ig8DwWocxoSGoj!zKP_MfKT7bz0cY#mQBM~m2B8br1-
zUv-6UBtkgM*6`>wgm(xvge3oFy0$-3r%eboAVFLiziY<!{}{eLGxj`V&AYN(xTtUr
z*Pkg8QqmG5L&G9NlQ*9R4?nK=O$?2U42y^<Kec}rWQh4gD#N%J>sB}|X-vU|ji#!%
z-?d5h9(lx*@~7^V<tG&3TCqQB5Vwx>tm$_8D6+ek$?*!M$SM1NG|_Uwvix%{<_Xy{
zD`Xa|0FM-MivTqmJNQz|u!+}RR;Fy{b)L;1&OIN0xz1MZ<NSd?ynjDXPBWy~iv;Zj
za)n&V6g#OYTwiG0vc+Rt5{%0v>|E)+R|YZSFyA1)B4Zz8?TAL+YlJm5-gZ_PFZS8T
zyq7Y_D_X91trDYhPNAG9VZSwy#ZtB-S|n4dy!MQYwPfo)f_19Fap4C4SZq^|NRA7L
zG26%}Yltya7L*!|+Z7Wy#<dU1`>l|{bSMvbVS|p`XKxdx2&u0`jujw$*X)=?@|H(B
z*xqlHE$^#^xd7#Hl=m|11+q+s@`^oVrhRr#(}2`xQ>_9LmC7Ci&kLsgL=-pzrUeMz
zxM`_HU0K$Q_}Cb1WC^qJ?-j;0r%G)HVy8^BGHe+Yv*{$&*@5lTF74PBNLY30!KP)%
zL-Ifp@?QWwi7cqVgSUKv5?ou!P?DJCjrIW7p0IjXd1)Q){bj5XB_9;)m}r#;;X>h?
z`|NU7xr0Y#VMpvEV>{rR(8#HG*ks%a_!dCNY67gGM4jp`VT?bqs*|>|K2xM?E3cl1
zUFC~6wAk;eY;^4_mqKV^6neS}^GdC<KQ5N$Z%>$_r{Djk?3{a%_QClDdF+F8ZKS)?
zIb};6@JajMzW~1wYcwXq>zd8o8{zp*d5n8*h?jop(#geMJ=s@-H4rXe4Ygj~vZs1&
z)E#@O^TrJw_45MoblP7ZMB_)bE`!I0BA9`DyZu`#4<Gm3ckjWwHHnkA;G^uOi55=#
z0H>Di@Yoi3pIY`U>h?g7av$;9DoE=#z>Dq$O@M-wy@ioe_7?KvZ^>H=seMTQidd_I
zRdi&#(GZi4JOEqG6%-Kwi*vpL*&!@|;Kl59O}a9sk(Nf<PNZC6la7sFuk1+_IODGf
zoIzIg!kJkV{<+$s$5Df4W8{hi{T5HOu=53O!X}j=h)^vd)#j03Q=@>OwhJ(b)q!MF
zlH0{C(3dArEr0fAqH-nPf!wzL2)&#571Gll*gqigKATXa@sm8R2wy1Im)7dje3TNY
zdgv*FLW2+|M7g|19;fmg7A4A7#<glc*<pJ{PA~RzI-ggx7*ruFBZwP8wK$3#sK*hh
zBUwK@snU*(L-U+_5p|B6ou5@(=X{;dX;CWWVK1W|swEQX(E_ZU55Ks!)>cs=9sJs2
zwK%Wity$JAN-xjRv7+x$3*iXHdv!%(GrDz$8o2?BS-lq^%59?kx58pgc@chO%ACjj
z$zJz~*Irzh>9$Vs04lh}Sflt6Vil*i{Nx5KMD6;&DbRTmK%zI8M6(!Uv^P1387I*Z
zXmJX_uc?LK?Fq9-4UbF(UBx5mW}sEY3gm&xB$-paqGD6l%Df;|@kzQF*^_0>3?w@R
zLOvjCWM6v%s931Zo(dcgOFFTPSBysAA~8!Uy^QI=l*+ZXvWhb2H58-6dRyh;B6CZ-
z#MC$)r|y4Al+i>w2Jnc5g}&-0Yshmdd~U|dH;X;jHW|LMqELT7+78WE$~&)B5U-QB
zMWbw?mm>ZK*>%KPR<g}Zw}kTb=kK1pY!Wt}_a`rwl=WO$_7FZ_QJylBx_zoYdVZa^
z4-;0MH>dh7$yV72`GiU$%d3ZUGVSWp9z~72Yci-V4eoD~09N50g-jDp_*HcnH>_To
znz!FOQc=vj9lrecZe2`Svx;#puS0;8I_)Z48vnSaEW<rFuho)nX|+sQhdVkxj^&2(
znppUgi5JjLwOE;dQdugWbGPLAyw<!lT>0L(_uIE`vb!B)*9_vo9vhTNeo@uZ5t$;^
z)@tnGVttY(%=r{ga(=`soi!9rh%a2YdH;UJigP=Uxg8No{9tBP=<FNNa55r0B3Koo
z+R0(RWDBL(J@2;Tgzz-6F^I%UF57#g>arCvF)K=!4-G9ZTM-?-yu4$rSjd!L7XSl^
z^Luq!MYwGAsqwEUE~?!7^4>Yc#dEfIQRD@LinvnjY3Ezp8$<{Oz=b(j(xQJTJ`9b1
zntEi<s%L~4a&}IQ5)%MEJzb=jF6tWMe<&|b^bO=dn%8uZ<O`>{K_0wMfkjImo73S(
zdC_xwI>872p~kNv3-9#sX*?C)sesn0taUnQPldbHTp8{{@NL>lA4HZsFBw8VsR4y*
z!B2U#!h>Z?TfLg&!grv+H^$ox?aAlDF)_6zg)%>%t{t7^jSzURM7b#v@pJMF$ow2}
zDZNn7Q)FJA?g6;W%AfTH8T}|?{(wf(DJrQdAFClrh3ACI#+n~@)F0~Y*(3kF4dR(C
z`j0dpR7AQC;u8oOf<22$<)@EQ@D9!VCua^-K<>K#nK4{}+TIAXH}{O&D_a+5Z}K1I
z&YQK>zY%JL+HP7$0h}v3gy!^3|ATBxykeHdpD)~eU(DM6MM{jPGwyHVxU)C@7OZ}p
zt>xdJBWO1XYb@n>6zK0Wh)6*d&k~hlCu*HBGR3D<rHj5vhC+YK_HCW!jDlM!J$iE1
z7+12t%_+?u>dHmyG;V=+RSr9SpL2&il>a_Ur^}Y|lQVX@z(X>4EydarYxlkQ9&MB+
z&ijqZ@BT$GxI7u%Ni`f7+7Sl+DaZPN-tL$#+J{%+UZFqo)Ec~FfP6BUM?)Tx^osve
zm3w^^D7i)+XwEMQN*VpDz2N(nhe%h+cByIMBSzx|)|Jr4ydmOG-;7QoxlG^;N)4?m
z69E^1C>Yy|C{Ee<xP5$_VwL?l&-o5^YW<n=M1ZFSNEVu15IdK`@hWyZSODGrn@XNg
z-P&p!9bL1gIyo^R-Cp*OtZft)Wn0z-6c?qW<REY@_JfFhUcfb<VyBzD8-!ozZMF+l
zHiWo#9-XpJ-3ubGIiq!$7?abkIh#^%v3Ee0l*vm3rv_+w$YYtH+AblNlk5QeOm4}G
z&r9lN7D(c!R!jyPWyo;X%68Ui;YPXMfWG%U*|8?A0+S7fDy~T$Cpp$r8pku#WOvlL
z)rus}RS;DP{8Z|UaE<~_P}6j9W_Z-)4ewl41fL3Zm0Jvl+#%CRmXK#bDU)|1sdfa9
z?X>Eqr0YD^^e0Z)nzV1qep+>{-eDn;LuONT3UwC&)Kl3xAma>{Q%5+j_Ne8@UW@el
zJa!8#o6<KB(_MSM$leHPeRA7`b%7)YwMEW<DQd-()}+X41<LeXXGtrArh%D+Vw!%u
zd*nt%%b13a=0ejzQGmBq#4AK^Ll5wx9B&7%L(44#a{R*Kl`){Ns5oO~!X&6Kn_ttE
z1aGY|v8ojIBa~kYKlE*|!@>K-r&x!lqW(I!!P$RO5G9wb*tBfYn4ozR(xvnv?507$
zJ9v+0xFlH|@(z;tOPMRfK;;!HQ97>rMPMV4T7@5#%9D$T_AHgx%i>LrXDlY!G5Xw;
zfK2Khwg{{>CKHuw51Rs;5=`+eMWXhIsVHymRo8u<4?4zbNc6P4?o-aLuCF{ML!>UN
ze2#=GqDiy>@f+oRdPF~J?PA|ESuETepw$g^>)ku`ux8V9Yn}H-Ve~O#4WrEWg<cK&
znLFyEkdQqhKRyE~+ZFv$+CYk#&~5oPAyXqS<<PsLM*a^(XNNne6iLdrcm$SG>J`3K
zk63E%{JcaM6(4iQo>W(S)p^FCRAK0aT>#~{;a9bGM6Ps2wz@(T>56Y9WvI4I83*3B
zFAq?d+_OLTfJIsP<N`&7gJdqUpDC*nT<G9UvS_u&4UJ;&y3K_ZKJns2J7TDeko^QR
zm4lRdp>BmzkQiF1A)Eaj`q~BYfsKx&#DwIAh`9cdim0BH*o4H?6u)wA&i-*#OOPdG
z-JxvfJ$vzhNb3t==@n~oHM>EB0gzXWv|`?4U8{ynOCk)8P8ba22pK|K_lky)YHhUQ
ze@Y@&v?oPICB_z%ArPl){q8G?Sy_om*;&c@=$MG`=;*O_(21|I$L)`b?Z%FCf*5yI
zsC8K=97I7XOIKE0(T~$c;w-Q=78EqvQj(%51VfY$0I50MZnq2pURG8?T6A<;^hHZf
zrA=8b>JhI~ScNsI7#kGNfiz%`hzCkU2Qf^hi|X(Tj2ExS4iw#k>`bY!#XYQynPQxp
z1kD|s|9`zxjbBbW>hJzH`_*VnUVDan)_mlB*uaInM*&u?Guer)LEBEd8~Zt+>NGpD
z`JaG1f&u#^fIAlqs`x+KP6>kHG`l6v1rBH5F~Rrj1HX%dK0+~zgRsBQ+n#e~THcGQ
zPV)M!dY*?m#OKYf?2&%oBJ|uKmOlk1VJ4p~4CBozK)24U@_9m&J)7Rgg};(~xilMi
zc~*C*{z(Wko~}vA7zZTIhs0d1&eU94$0X0k7$8WaK^o$%Dre<GGC!4>D5bfIf6v#!
zmkz&?r$o&&Rl!zupd;<uQVQZ0(|Q}1Ov&XnG?KhX27P@e|C+Z+vh3+frAnT>%|ptR
zJOxXX$k`i^n@T~im<edDpj#CKhvm9Vqd{3pALUbGQP!qMS&?rK;uVi$ode{TsTx=@
zUpz_qQMa7cqypX_A@7jjD`*w5C6Xu~RAfjDb&?oq&X#2@iUC9O6fG(Ckhk>wFoGJS
zdqt!sznPbuC6^=1SmgJmay*4A2W>6&igKARZycHw=~CV|GzlKqcD>-a0?c9Ge*=%w
zE|lX56R*V(@9>iLXj9}Qh)CWN@F?;Y`64HWAxiazFTpKq8R8ne0QCxuP(Kxi*oT7-
zX_e$XY>?rn*}>-3h*vx}ulaj5X37SnDJ`SE7qTRH_~cj&=r2Wx{R2f>@sMEUSPV1l
zTQa-$E!~X{RwY-lZ;8(J`GWr(G3qpAn@EYw<QIK`<uc-XO@_z!l2iUZ7I8s#2%8bp
zd#?Z5QJ72z2ZiKIeO<oE*gqFxh*^em4E$*}CU`N^dm8NxYI^>(l3pJx(ofvP#&9lC
z&?GMDO26Ktc;JfQ`9{D)z93JC?4B<!@NQD;^@P*2NR&x1e<5I|GCiTTd%n0pTCSGq
z*%pshQJ)&nt}jHpP^LN`inSf5Dbu69USI;Z$%vU33|>M$c$ZDdRFvGE+1;Hf>CBy-
z=9gYVIwR7BiTT(g9oe45(NIOk?yMgCWp#H;0U0Y-W-JgN$d|#+=VpFB<U!)BSbt2y
z9yhl-id$QY9lT<5bMxkAP!7(Z;rEI&phR>qc_)XW^T1u;{iL2pb5M!rc9GY)G>FZL
zbK2xj)QYpIC%)D6p)cfkOQ|^DvXu5v!!5u?=rP&`=Och4u^8qI*)stnLz?<6q;rq2
zab}l@!kDU+k1ty%&MV_@%cqwa)nm(eGrpj+%2LF^WjM&}RJeT58OhQ7m-0Dg&&anu
z$}wguykGeZ>hWfXkB@USZx``685l3v-tD%|PFp8M;6yto{Z7#f+q{0CKq_7jI>$Jr
zc!Ip%Icni_4goEnLjFY97=Hks@&qji|Dv+NMQ7KU;H-(pjn?3a;CkpwGt7`sB^-&B
z;k%YK_l6gQwG`E_3Hg)%JDmU4|3v8Ox}uh_g7Dtvj<X}GN)a7&S=PwjmGerg`&VWK
z#Uuo0E$^u=owsuLrmWzE7>rigH|vV8Dxv^+`F=%&Rf-}?{zik=p@a~f6kT=cn*2*j
zYiji7jMb~>O^&Mi?IC_v=}_5bV>(vI!0rrBQ7D?PgQg^GhD6c6CNn<CorYW38y#lu
zNSK>$S-i-SAvL6={OM1BI)2>w!VAvh02$cKpZJfoR-!nN**G&*;8x>~IHRgGQ{r@j
zpkN}RB`-?kMuQwC(d3h(EzK!#Buxt1={$k(k#cW2tuuy5<gdCzVge#7V7m&jLp9Je
z>y=oJf|3cWNpVR%!sN^bw6sW$>iE!fVN7{ROk8S8e5^@{Jem|8mr`o8Rv_|dd16|6
zf(-wPX=0r^ubI-O9ETkC5qc%zc%8s8?PbrSwZlZi;c7HgLli=Qh=v)GxX^C}n-y_>
zM0`|lR3~R;B^xs{jd3PZT)fFNcYSSIg1u~cSwc)gnsbp0&!>Ko78jqER&2T@*^-R^
z&lG1T<@y<l%~q>9Hq+lPHz`{x_Q1Cc=Y_yGY2G<3ALrh(^l4y>ik4w?pB*L#an+zr
zXA*0Zwo*#blUfmPt4Tux%5!td><MW#K5{pW&K`U-dAb(*XU0+!iw)GmVre6BhzDi~
z8S5+P>>&Eb4>Do~#sO$8P2rISabOw^U5_@qJ4Q4aia+e?rbtW<v>@Dld>sFGRaRD2
zW?rGmRA?GrUt>y$PN05@PB7Iw*HBQ^OH5Jej~t3lx0F|`tJLKMRwQJkD~;MCdhIWw
z)08Huy;ej`x-Q`nb5CStenDlSr6Q{$3v^}yo#9*|VZX?PZ_n;-N2K92Bt|XK;XpYw
zg&!xdvzn))FB$H)#&SpR{JjileBADt=}SPG;`}`Xc<|^Z#Mpg|(@ERlRw<DPK0v_+
zh!06Pd-*Axi*_+=7?3daMTARrV-GAVliu5`wv>%Y_Ci~cabt3UU3s$>(;F|a<mJ<K
zW1gkJiaV9hQT`5;|0c<nWV3?!9%&L6&NRh)+g}>;8oJM`ede#Yuisb87iX<lk=2p4
za%EPBfUgW^G0)?_gKif<C5Y4A$p2Qx^&foT{G97~p=FDuv(vJeT10V9MsWVJl3qQ*
zjqGeoohRRapGP=P^7V2C=QA=6giO!RM;tJUpF!R&Fj#d}vN@6__9^FuoUJSY?)E%4
zL|&OJpMa4*<p>eL2Hk<5E$V?RMzxq`D!sz9T=^u7DFyzwXM1q=CTszz6R%B?ao<$c
zd=Fio`tDUJm>VV9cLBZHW6uRQs(UFUkIfQtf4@C3%o8SlPP#M1BN1Lh(Vf?mIW>~X
zB|Q8acz~5iK10HVU&W(I@}9hq1iy&QXZ+t$F==^&g9VStXS75Uf*zZ6Sc^EYrM05X
zpK#ScC$~V?=6@4s5E1P|<NX2tRjhTDVVx;%^h46Md=B&=<RJ~1&w<v1MZ(C$na6Kb
z-X~<o6F=d%1Jd<;NbCdiuvLioxk{K)k>)C-AzQM#0^l!-yF%EEMIF0P|2E-kWmENh
zzQo(J>hh?{Z8|L7g!tyP&XC3%#W#o767^mwUBlOjHGY%rA4*&T)pLlnZ%cOoXS6TU
zW!X^eAM!3G(DL=tU5JS<$6J*(QxbdK&IXE!?-732OWCDydZ%=Nw~3g$9`=0?ub`&|
zB~ki*lJ`$y`+=iPqK`c689!J2ZRioPY!CI+pbZyGgW%1uI?2&I2++yrGe~x?^BE{k
z^tXr;P5Ho9;4M(JkpFX>XsP{7;zW~NelO(qUtxCflH2cf%k3V&yXAJ}cZ$P*FHaHq
zp{2=^KO9Qnzn*dUDK-58Pm|J5AIIF4o-^NxCvdLrO+;tvxw_Tzxw>Azxz6;uNjcMN
z!Ne1=W6Am8%3hGK<f5^a!F7oDzJ&BXcTcJD-g%-D@SXI-d!QfQL+Pf|4|`<&5Viy!
ziL-<2)UlBH4<0Gb{mSIgusu}E-wWFVkCyl9P@ZVmAKbr5PWNv~6UT85^L3>6BFe&)
zbC{1zJ%{-?&S4hmp^;2;4)cRk&tYDGIuekFs2ZymX$7M=Gi{T{nQ8G8$8j#2NJBnu
zED{yxqD6b2i)KJMcBFxg>c2gg<*(w16nx~poNX89vS3>X^$+UB*(-9H|K-^$Q8?X2
zoV{Ytei;S7f&*WuO!)aGp2WF#mqIqLLKXk7&oX`czdXwn=PH0Noc!PDyOGUd+u9q}
zhgm}#?A0qmUh#i{^Tz{@g)X0CZwR%9t#4=_4{xbI1AUjXa-7}%!hd^q`_~QY!|fSc
z|8HlvbJmJ;YaW%(<lLH8<=h(osscJwuAs`_9(4M1YYIRGZv(%e+tZv|^GmODYiiqr
zY!S1ZTSIc)$B@w>dthFi?%bMrmme>==kDTPU4e6JX#2@@=hkp$gAew1fbX<E*gF=+
z2R4uoHmMOl*x`zpZk(==n3AfS$p<T+-~1PpO)Jc4bx9Z^+lRSMBOiR>?6Y4u`?4de
zS07oeo<+{DlaC_ju<$YdE_5kK$#!rA7znU&C$i=E!_Oz*n$Q2Urm+9ibNLv<w11Y)
zh$eivbb6scxz8fk@C#ZhYUk<-{c#*?w@;uh(m%mJ#pgJ0agMx~7Lye+(_!Qp@x^4r
zEx;Ig9(5po7I3X<veTLegk^{n(>Dsr=FY1uogY{jJT$MPzs~At-PpN&B+wqTxTbE=
z(yA$Ekwd)eUA@S47CEg!q5h?KQj^XiZ~1(Nv&b!g^S8)D(m0e!*IDFZwx@H_$vVJ&
z=I7_|h0Z_P<}MxG(y@Fbv@oQ7Uq^?08u^C>V|Q%nY_F=L)5wXR6`{-z(DnfGpgU+`
zy=-AfpYcHQtt;zlSFNh8U%^+~Q~T%jNg?a2JI2PBpHr5;cm3g~>pO8G`Ks4(Bza<J
zMXNZHymcH$k^|2a1K>5lPkt}uO!C+MTKHES6?UuFndCOW^n1WWF=Qdlz_uV8J)-o=
z6WsZ<><uBE+_nYzgc$DAv}#pT<EmAS6?5lSl+T-Y^~2km%JNGF7A_hnDJW|;%^RDC
z!^wFWolR~nsB>P|J9o<I<b;<@!0?8O7n$+|a(tC_r_&rxen>@(^VVq)!;j2}8Ny8`
zb_L%7+c4%)WhqQ*XB5eJ@*&$|K5Lk`9%))=%*d8;*at&-Lvb<^hFW`jgg+rIZY9CL
zn#!B)yz|wHJ$aV%&P9E*pLgC+jE(Z)k)Z`MpLgyn_L2Te=}$T{3FFKyFPQ~eF{Dzp
zlNZGK!g#H3ve}!P?Y0zaDvZR91^ib{eN9b$DPrLUPN%yx$2rP>-PhdQ*R(U$ibPhc
z^HTA8BQ2<~CXq1?ST(>t9t0R@KNIa5GQ)~lF5Aa73e~#-knP}P09X8Fis@YFd9H&s
zviPOB&e1qgrwkaNFe<Okx+a1``k4wp7Iy%Af`)&SY6WB}oB*5N_^5#TcV0~)nFDhe
z+r&@s4}||hY=@T}5+`$h`qLa7(DcDkeCAL(*iBBj?Ix{my9sB*h*f{x)L8YO>`2_P
zJ!!{|r0qKrr^KpnMSRE;XA~b2RlTpqh7|Xt$Qk29N@&mPD0kA?_9((R>1_L*INSab
z`D}Z|lCqQQakPDK{?!G$cf$&eHhZK|ttXc_KS!d9{Hv|IcVl`co6!J2KGQi@Cv$GS
z75{JLXH7ciN<!HqsDG+02T)9F%kfIDLPNG3K=i=}To3clBwG&k?GE5#Mtf81>FiCd
zu!o&EQ+rtJe`^n8EQt>T%krOvtx=viM$w(hGwu8Tjq;30PMj;xJ}j6QE6={Hf}c>H
zwJb=ARGxKgyT+hABXYmyXUa3QU(Nf<Gj{p;#4680cwVJEhcJ`RFO=snrt|q+c@Ag3
zz8vG6`;flXiA#RQ8Qts><ypeK`-t+a!M?LslxH6n$K#Y|U$&jsE6-ZA>p#k~j(sVm
zD$jl_Ril>a&*C&sDc=XOY|ZD&a}dk+$yc63SfkHb%5xYC@%fYT9L@rK{t1j4u@a{M
zoXLu(Ha5ft*(N+KVLRAn){Cb`Hp0d+mV4MJzO&$)8l)P+eH3Z?*#LaPTf|#0?!EYS
zD?WQ@=DpKsE3jIPZ9{{bj7xTG?lm@!j1TvW?ywkZhK7uzgZ%?zTa2T<TY5*g_Vyrs
z9qSSRtVbbK^;}m4tgdTlaQ$enaV}+IY%zZC2UtVMF^bpT{_&wMJXld}0sh&*Xxos#
zh0Vj0ce#0<<ytKT1-A0Gxh?a`l`8U-DrU(vDP@Of^%ej-0z4VLfLO@d@eP5bf<|+|
z08);zZjpYgD~E*@AtHP^@ZZHYqD)HDhXzrftVffmX%^hAtOTCcBE~v;N4E@)3>)P}
zENnLxz^aU^d|+g(dt`Vk{a`6_lyB<V*gG=TX8{Ur_{L@}DMfDY22REGw0JNA7V7O@
z+{e%k>Te_9C2F89p<l<uw?qR*Jdfhn9(+Ts8b!N|XqTF2`d|By=D(pyf{E}xAGK`5
zeI@QhNL_+z2|A+Qe5GE(stPyt0MRzVF^F7SfL|A>4Q?^^4vr1<jvBi_r2T_i#(GD4
zdyHeFT|K><x<)q|N9Y%i*S;CZ-8eXGoZq|6xU#ovsB5^#h$|m?M?~4(7!qTngS}f&
z^XZ2ln=+Ot7XeS5(vIeALo-J4wjP6(wT+???IYVjC+qP_Lv;+px(vw3M@sr<L6^vb
z-XaLn0(9&|(y_74W%>Erwr#U)#PGFrk64E3>;H!(<O{gE(Vk(nhq|~Q&~63{n`eM8
ze`v6~cX-R-aKB~q01|mqxZ1JIbNnkr4vsC!6VAl*B7TDW;_=BJ#|uX)4ZP{T&?j{8
z$@xPr4utJL7`#0c9=dR>N76%sh=dLu4S#AZ_<ub17A2z0WOPsp_5qk6ji$2<NQ0T!
zTb2!NBNu*G3p#{&gbkjtLX7QVZ~_|TW#9)DSmRoS6~EQsytQalJ!=40Zeq<~Z>@sO
z%>(T(U<=tIFvfQ9?xk4Ku#7EdD=-#TVLn=oS#&L1$GW)0Ze{1Q3)t0cKf9D&#BRh4
zaFD&r&SO`yuh`d^tq!oK*n8|B>?U>>`zQN`{fixeo%MOlxA%dly4mFzOE02Ro?|ay
z4tt6Hmc7jW2Yl)c_9}ag-H%TCl3l^xVsEnFp{xGRK4%xQ4d}j&-~dDD#G65+XQ2y;
zNXJ1=TS0_>W7|P=JJ{K5H`~P?Ww)?%*dDf*oy$IBC)qE#hWl_|uH`!J$4)UP_vZmT
zkO#2|O#H!|jsWF643cOBd|d`uG^1d>jDcM<j>og_*uQxKPsHgp$=nEgZz{ZrX*`{0
zz&@JEvv@YoVc+9)<6NGHbLsPW0k?7+w17e^;wt72UcyUx8T*L+87p-vcqL9(pTnzp
z4X@>Oyq-7kM&87m*)jGr-ojgX8=nhb+<d-(FXW5(V)h^Q1N#g6D{tpZ;B)KX%lLA>
zg0JMOcqd=Y*YLG`9q;1nc{lIjy}Xb2^8xl4ALJYOMn1$oW}mRz_$HjZJi<3aS{%jM
zPh)(Xz0E#ke_(%Pe_|i7ci8)UE6$_d&Uf&g{A|99?`99MpR<P{Z9L5$WDl`t*gp0o
z`w_dFJ<Cq8$Juedho8eP=6exccppEHpU*GA`ls93W$Y?;m_5OM&G+*k@dNxqocDSW
zzZml9rTj8}IlqD*;#cykaPG#{I9vM~el4W?>-i1*Mx2`c6Mi$|LLA|@@}oFU{Wg9(
zzk}b&@8Wm!dk}}_K7Kzx#(%~ifKB;9#A$z+Kf)j7zu>=w&-FO}6{N|>`4jxt{7L>4
ze;VhzKg)l^pX1N-7x;_(CH`CfGJi!kK0H`ZP*b3*+0@lNIx?*5lJD9Yhy`1FwO!(-
zs~PDZ8SdSv>yqz*b=`xb-Q%13hI+RLcDtVg>Uu`Ty1KzF#sa!skJ|d~E)=cpk#F^=
zr)x~tpi~H^PZet$<O+MmEug`bJD}I~sB2Kb=#}r<2D!vuaSLp60}|Npehz4I6%f$x
zdJJmn9@(_1O9nQm-{U2)*_}6V!2RsgyuNGHX8<4Cma)O1o?h*sxanGymJBK_X^~q(
zg4NbuU5f(qpnR8FTBX4afvxUx12?#zgW5a*3EJrO(m&eUJ4`d(V7GQ|SNHf>uXae>
zg64Xr8}fM3&Xo}~ByK))(R!aDd}!y(`G@8F^F6Wz4ST$3=gawr#ZA*N+^^{!?$<3)
z8a$#jc!AvD5pfG%Ffcyc-!(eEX{c*_EO^B8O}kjGcT}!-u}4)wqaH8X#d5u)@~vIY
zwME<l+ui62+~R)rThiUzGdMKV<u|50XqU*@$HdKN33cBXb>C9C`^M$&TdK5XTxrcx
zxi#bB=DTzh^PcZGeFiV}Y)kOC=bLV+(#_-YJ#d*DvcPQ~&nrBhcetOmE9Exs6t{qt
zE;<d^>3*ysDC7iPu18%>gZN6<)hq6S3$_e(Z5fcCN8Har?H&jY+T!sdc!16B&@SsA
z?ZTMfCf}BcBDCAY&3{?XAmp?ygIoN!sZZJ!a>5<r=F>1fIzmO+tX7A7FS0B5Lgiko
z-0PHky>f5x9kigC!CN+Wb@#3%mIUKvxYwJdYiKM&{st-6_YRG0Llf3(`quUZiT<^;
z4~=Z?+8C&GKBe~;fLXe`Hv5Z4&{K#T=i&!BFC`JwK;H$(gdlzvL_kl$u5O^WAQcDn
z;zxZ!4`H4ZL0^RQZtouI+SD_$ZCLz%)_CuhF{l>m7vcTmgF_@(4Gs17jZu;a*Ed3s
zqpw7wq25g+V_u2GH?BnfqN6E!@cORqjjlHx=v|b%xp#DMq=z^&iQDv5!1{4iD{2iG
z*s*z_SHKw1JKO{E5ea%XsgJ>%yGFr$6d*&yo2%~7^%&0^dtKm#DetZX!SqG1E4O%a
z{iy8{An1sEEa<bI5scwcJn0Bn+yhBN7$Ou8jcIoPN+jwV8UfNN(av6xO^fM?ejnRL
zh4_sPfLYSdeIw(dtiHjm;`c3s+v)c$;LMcWua^{GN;o_y3J8$#WEtyP?=Oak@-#Fu
zOi#gEdN&Prk0@{cf*$F6%BFxDg2%Cek?}3y)Ku{1q46zL)26{;@esIad}wTN^Uw}_
z>EAQBb+Cs3^XolpylaTk5A|-L^Z{V^U1&HSgQy5Kdl3DuCJt2+sqcZqBV+1wNZ01g
zqa)kZS6Sxf6BCfNIbK3=RfsRi>cb%^av>>7Y!~lm96NV5F$oe9zJ~;~1J65!-Ny%c
zNT#PU0w8BT!KC`exl35cP}dk~i}X7~Ep#Wj-WPJZPtD2(Bg?K^yab<%7tAxVvL%b#
z@H{Cc<-@82kxxTU9`8as=<eFm%QE^vO)Ph4u)mAhhDN%FSc&+o8XF$pL@W7_U!#;l
z&oU(V0?!0B!K8(jL9r!M0fP<NKrJMG`i(;ykni<MKK1*j<#*VW#P2+td>_EgQRV%0
z+_jv1P#lU7o}q`J{X$mez>_cZm@?=WosiyrfH_e+-vCG>SUjXtx<{c0=pGHJo$h2?
z^WlTaZxYH3;dIV9azLh+Q&I|AFH7Ii^Cs7`uc9%<pq&!5phzKM9)jF?J><+wg`{~I
zWX&Hz8oeCy=m8;xUP1CHN{SR3SQyGRDO!@O>vTio7z4HJ5_0!ilChCG0+*ktACv<9
ziLlUur&?GYZ{$CNd^(*B`zYkqUno-UFNOSi3uM_FAj_VB^a>dkr6dWBkaqV?)&FGO
zkE|el(0;CB4?>sdg1qmN>wyc7KE|F5T)DVteLYr*;QLS!4J-*Ta=>=CpbPg?81sc>
zeJ`YJ$k>x|5S0+^H$xlrr2RNSAw;wEyGr#5g6b0l)h7t5M@$SQ4d2*1It;wYB@(65
z$dXt<&<%_S-XqF83%c6BfKD67L9oygUXa41GU<TymZn*=PxH1<htJJEANhRaEBP9H
zTYVS$?(_X4{(bfTRd@cuRUOwIznAx<r>6%9{X`Ok-zNSMI{}vvOyf}3)JgoSZl;qG
zcS<J7G*g%0X~#3<57%+i)=fL*4^NGgm`svsryXM;PLn#G#wIonsSW;tsqF;Y*!;8*
zAbtptkdWl<=X>_v`|gt-2w};d?45mg@7~?BXU{o%_Uw<lPrsGTm3LI`$+hP0&V4`E
zQB_^lfd6>alBzFN-Cgxa)st0kR2?8s@lsB^@=dvp{H6Ij^7rO{l;7o3g!knQs!tK#
z$NePms`LE%xXL}1d-6S%d&2uDu_M<}eNV+_3F+#nexUjx^FOTKCQPdLRG%zNEqu0c
zS3%z|PIi4zEJLUX?le<#zPNWito58GhW}%CT|u*ZH)!GCuv^`$!BqZzZjpPL{Rmyb
zGIt=jo&0=B>~@l(A9<g^_&&p%4)|Qp`dI^SCDBdHElhS>C?QA6K2q|JgusGZ%nH;1
z?^N(PAS@&PcJgJI=*%*L&xX<m6FAr82f*uOFUA0Ho&&})U^D`w8=MA!kt6>}O70{7
z-e9A95jmT{n>ui5AhmKfAt952*XoW2vyg?^l(Ui?tAb3p9Lf7C{|>u?FHBauj_`iU
zUQ5fb21=i41m&Jiy=Q>OO3Jcau0Q-5x$dWg`XJBS@xYuw8NSw)V0)Zen>HqQ2f2GG
zHy3<``yKQKD+sRymsNZVw48ERqigO5PCey1us;j-{a_9xQi^`9yqyAGQ+YpwTB#Oa
z0XONil{~Ks3e+Z~HjdgjY7s)`N^r_^jdzED{43zoIysNI(V%C6oB{G-AcHG$Gw|9u
znn0)4E8N?Z@KmtM?T<=$gA&@HKqoX46}rL)`672MuyigToHOWE5$;)oK8sRjgTHWI
z0smG)iB;rY4i|)LBRO-HvyK|7_KiR)Qb7E3K*~n6%mXP)?W=**AJslf?fcQs72wrp
zP^Xh78&tZ3aOoYm)Bu-Sq3;%xQ<DsM+Jfx2A~{o_<5X~&j?B+Mx@W?<+3t0?wwIEo
z!L?fEehR^z#I54_0VuHC?W6~vO{vwC`aIOwA3n$m&RX*7pTb3rQ<SJW^+xp(-Nn(<
zRv&S67Ff(C^-3tSiaZZc5338S>8p931pYlhl4KtMQWqR_K@&Cil6@EmXTWs;TnB(6
zc@Q;D!KE(nItYwL%51{>vUkI+2I3_Nt!^9d8>n$3ST`ZNt?tKAs@~G56;T!`CWB#%
zNz9f=VjOy5C+&DYZLpg*co6EXvG$OMPHGW#z@H7}7lG3<G|md}TF&Pgs@-bj^!^|R
zr*ulHIF(1<##4sQ#Hn-J=}oGr)frlO4<!sxLI)5E%=fi06I2&$fTL~jt{rW17%H6%
z>fpvCAioUcS~yXIWUr&v>%n^izK!r<l?0wAdDFm~9B(Ru$0(x>JlaWh7)d@!d;=IA
z2b)H)`6bxYfK5Afan$)NSoKn;99TK95=Qkvnh2CypyYuP0;L`q=>n@9SnXgIwFb>n
zi?5>{&Y{dJ@r%LblR*4B5dMbed+^`D-;4iS{5SD`=XM2~xIcy8jPG<k;R1IyT!>$U
z|6QK1b-ThZBKu#a)CZBB@8BPTeiMTqBJ=Ag-Pe{~pl);UwEG#jP6O8hxHhZCVWWF9
zY(b7c<hF&^Fpt~}1=^qhXCKfiW>MSO;PE-6a2dS1ojckl_#X9HhfJ)8V&ch0(zH=p
zJ3KiIWlo~!8r^Xy@e-71fD+qiDZ6PUyW#LU=91S_(gq-GL{GHw+zyn(KsZUfbZj?r
zfsUmu&5}gnRsITSxQ-gEhku^(4dkpQX9GDaqSl$A4dp0ryS2(3Ihx5KUNuKp){>)<
z9Q8=3)@LOt!fO%f=J0eW=@tNaHt9Y}x;ddV{W$3>B)N;Ud8j-d>6}2xb;yX9l`iCB
z3RpNaaUS`#7Hl-^CiWD0dw^4qCQxtHM7qhqX+{!S+zDFY5p+@)n)wVecL*stj^z42
zM!mH3b|@)@8orMlAfI~VYV!4)M%S02JAm{o^s7gcH=xNI;pSvYZYE6&+-)VrRAhOP
z`!9O(8Yue)Gw!mmd>=~BP46#=lc-}qwdkUbd*Ot7(~x{e!L2XV7SzTkQoCbt`WP*-
z8ENdGolk^M6QG;4!ZK=gJ3RpXFEm&R9PQ)31%E4kAO1A{489lNhabSp-c#xfsWYV3
ziK7`(XGonPb%xX#QfEk=A+<&}6VOYu(954k-+Td&?g7e1u$c)}K4R^A39A`P;p2_?
zo5*uB{uaWw;=jndbv!?U_mWmYiB-r%1u{{AOjKYuR<LGJA*lnCDln-6lPWOLY=0G)
zRAJRrgrDb&;y($dhF{=*JNK^wTVn_Hb=WCx9~>Eg_S1zSdNl--M)+B;RPHtSd;~pQ
zpiQYqd<@#Z1nqZ1O$Rkk87%$Jg1)|pd*GZ&jm|L<uewR!FNox5a4(QfCl%Mw7Np%-
z?RRe>R|O;j3kwY(`IN<Q#`iar*TS>6!#YXTV|LhL({KM_m@;~>zT^NoI=qxYXOHb}
zO5e^3O%0q;k2o1k(F!M2Lmy){8`2~zwFMj{ol^)eqJ=EB9_FuTO*ybt52G_#&OKW3
z>r|K7`jl3l=h@2CLNNAgQa3Pex|Ox2Wyrf0=`^yroAsM}f^P)(vU2lH)_wk&6{de-
zo#}z#p<r$BJ)~d*va^Xbh|R1>yud2PJFIXVVwI?ab)r*nz89`*Rc9P4KGVV(q1J1(
zCNrP)7_GnD%*x7btfpucMQbR_8AaX2xL@lPt68B~LrWR=35{HXdG9Wm&uZAsjAHs}
zW#Z&`cM2Y6slE1(=HO+PP%RX0#OL5zff$W#-Y4hU-G8GW5{E?iF&2CBEUd40C%PW@
zHgy;PKV+HsYI?#hp2pC*H{1`fdf8;Fa$Pri3kSX@f)FYm1cP4a+3sHAO^!=%dnHe3
zU_K(%^Vm4o>9(RlDZ#aI9i<fg_evveK_?|?#dQ*w=u{A=q1=fyCbWw&a&9Nwe2z0*
z`h5ZJHFwbMb8ph31NXXn+3j|(K(W`se;+jLR4S|hWYHa<CQ+_2LQZ8t2QM}J)_~1x
z;L{Ip_i=7kFVEe?_Q#l!wpX0jee7^D$T>7*=$^t(4A54Qqx7U)9|E&|@mr;I&K(8!
zGsU>k47ttjAKY4_m%hIWurp(qr+0~*LcFO*izH5Ml|CUAle`+00CoPg^+SPM=l*w4
z{WD1sn)j3LE1{3^v!p#F=|>^p-f~YM?X_@o2Qoe4a!PR>1%wh1u7m!V6YrtOkK7jY
zSK4VF%sciu%lTH~!#t51`*Dn%JA~Faj06SlG50JZg%P^3!^|Us#qwL{OkdCv&L>ak
zm+teuzt~Ev-*%^gz8~${qxLq`sgEr7Lp~J!o{BXfb<m9K7&vT)vRIwJsIl*#29<9%
z1#$ZKvjDp_Fv}uIG^mZrO7T6YR*c5brRb1>M}7?_D$l(`OAfHsP8AVA4h~`Yo5Mno
zy)tHwF#{SuC0mhZgL1`w!BUJF3g-e`V5H_<%+38JNwpG_I9ZzsH$DH8ccUMOIqEI4
z;oukB?Qkha`rahk_~h*!<)^f$Pa-T^jJBI@z0zPjr?`v#JK8Q|)_=N;(JtYGh8OFa
z(ZPeSpP4z?&Di7L+{4)STG=(1)>$TX1$RtsHg?dtmY8iCF+`59C`Zw$o*F0BGrId9
zMoS}fjAzDl)R{(yanxb#NwfGFpX_JkB>DeMlx6}m7y+{`d<C1h0yJ0c_~sq^JJvVJ
z>roBl6MZGUg)i-$;JTzB<s!<}`mu8A#YG1hS^flj`<Q7Y#^5$yH)xEFo{PaM#MraX
z_et*m6wl~>Cw9i@N_||Q;O}&gx+m%P&Qiw3lyWh7QsrNCFy4_d)N(A*e&}G|4ph5T
zzje{&md}VR$73k;ekWR9Wi>^KGp6;&Gm}+xImV-&cEap~qGA!Bp`rOhqed^r=FI%e
z8ST-m2<eQov=rl4WS7;B-PrtGbx|8dY7XhWJ7ad5Fw=}ndOoTKD>nUnE^wnApMfH=
zzErP(w5R7<tT>xb7*eoiIZE2bkQ5_~K<_)Z-Fm<*v)KEX8QaM$hNA@>6jpGumD#=i
zcs_<1$hWY9a!|K3ZFl=($vePI&RNF)huqUTb;<oS&UxX8QChpD&NBy<%u0ccRzpIY
z?VM<{9-3XQ<YLvC^tH^u<7kIv=9JkbqFp}TY4*9FmR!kit!Ck(_z~aG>dYBKL*ytL
zNs5<eSVNQ3IKW#YnGubB^VYi+ini$Rh|%NhCY5I054q!TO%$o4b{e1MsjGIm)CARN
zIQ5j-Nba?kZDZqd?ZEh~6_{#4Qw?eH%BlXhPjv*+unsx-&7j`8Y#y_E%f2ZWSz2~_
zED`JQa=C`aF>}xk_cYtDI#s$?k=tMLe2h0QBZ+$%L)P)`SUihGs;tVenFp;PlsIx8
z!5VFMFEY=>oQKVc(~`W4VZ_hpzuP**!|ZU7tEH80bH4&|7jU1a`~$4UPUpUpRnUUz
zkd4d(9wBBQY4;C?jqTFrzVD8>XK0g8Io8+R8utvTG8*lhy8z9CtiKL7wF~nA7ULOf
zYfZ?R`pJ-c3$3vcjnPhPW2X%C(CD(uWX6@388igz9zd9+c{ybC3^6%(LV2NBG9vRx
zp6?fap(Bug&zBsOYBN37hwOHOM=#tfwG<ienvPLB7@`Nj#Pm<pm!vKD5!fNr<EYU%
z3-!<ykxmGa4z)k6I%(8`mem_C+5A?h=G;rPSuFR`Cxo?c$GdUV9j6D2_jmuU%x{!(
zyZf%&04JkaYwXB3kfF!jCgki1_hTewJ+q`w@cTITO~^`y{TbCr;}&AxDFayL?&kdq
z%=|t^>Mr**t5ln)weqr)gLVCY+{O9*vOv)MZ>dds=dq2S()aXuE{;$48F5^>`^9Mu
z^w~3vj}qMnrI;fCUs)K()GN~ab=z8$#j9j`N_Lj==4i1_r5KUBmR!e~U#6Ghw^8t{
zRUf4U$!=<vt2A%?geLfRdf&sAlSo#+-egvEE91Ei{F8(>vkJA9>shF~OZoU^*Q?uw
zj!xEsVtD?}UW@>(xP?4wkI%zR%Sr4G_Y2m%gQ2>n6hiz&i}qBL%(hUg-C!YiToCzn
z>}t~rqOE=s(@UF(uqx6!hQ|BTVAKtaSIN@@g>#8imA3I>miHKn`!OpISmm(tMa1g_
zo;I|4=AcLJ=zV0NnqDqS4tgTdiKboEe+`n3LFJuimTGWLVTm+2=vgw`T8ce25|=KZ
z$0*JBUZ2>}$ZQW<53OCK$a$f94XI!0Z^m!XPdg8!!Tr0s=ssAdYpiFIlyI|){w?k&
zV!eG~R3M)pOQK=l6c!_Mahp!(8e&bT469-CYZ25m#KO08Wqm~Y7i(cN_6&QvXwR>l
zOQ%PDQ92`G$$weQY#wP)V)YA=OXoqOWFk%uk|z4@7irhE=+wv2S*0*#DR`%l?9&ok
z%q+A)Z9>VR{Se2x(Q9#}A2aeKE&Ox*al%?}){fM~p=lqZ=V?OwY(le^VwR*jcOYH=
zI8|yEGLGWzUP(P9LA3%b8g}Dx%^0>7YR{5)x@$xb$vz`BKT;o%o{e>FlvcN53)s(-
z>=Akce!DGwFCiBc%GE}x&hO0H!m^e_e+dt*I$?Qpui*Nay;`X6k$9V;zQt>=)K$Fc
zw_oMHQeO`7#lBj(^7Nvm`<R*hz+7yHXq5_lqF!xe=l~;!U7W)L+dL?zMtI&SR6rhN
zF(gi<m$i<{NEG}ewRZ^=WQ5)w3zc>&c`0#d4V~%BUTy4x*G9f-os^P-8jbo>F$xy7
zM(RZdi@Mq@W<M{dZ+9YxjUsxpa~b>f$r2lHfOZ@^YZ2(Y60Ts^^`>*giPO8rlDMv7
zyHDsm0bTM7WEdOK()nH{%7RV!68jk0D~MgawQRj&;AQV+d+J1|rn6{#ILJXCY`9Kv
zcpF+(z}3DY&j+Qo9T~J9wRz2-_51hGian2DA6X=tUnrw95@7IHTGXJ3KX69&pN*od
zy%YByTw~e1e`c|N%sxrE(lz}#7UzF%NWz~a^d)=pd6O6C49sDF<_KL(C+IhfB^b9Q
zFI~tJnt2pl+y`j)p5HMTdfE3gw{XYo?lPsx0+P=fC)Ee+FutBpP1KILUuYk|H!ZA&
zwNuu5b_g{U>xE<{+RO0Dk}B$}v4Zzv*%ufg>`Pi&lGSz^B~Lz3VqrzL7Ndm@8)Hmj
zUgu3w7hXi;-a*;l=SQ^#?1&ELW}^K_G_iyl?=vzb#qqt3tRH>XQ?Ry+U6i8POul=l
z<y*G#SVNDdEhMDK?B)d6WHhU{lB^#GMibiL*U3myqYZP>Y*&&|0rqngB}E|*qYqig
z@L!!TLOI6A7OTC+(m5}=Z$&fDIxDCmg>#18tdT>S_9AQ1%8h9x>>28*uRw>o2*Z*W
zhKY~y`6+uh8cyGTamF2TE{&IR?&t3B>6iA9dJ?mZE+x+)^LUYCWV{#bT!fQJDRT=a
z#EDokQY|%wMFOEO#s`eBj70%<FX~%C*=!+t#pc*kl!>#(IP`d(tw-#GXubxn9q@IE
z`pL`nPfKwRkpdx{pcwgPM(=C1dhAoms)qX=MI2K(hY$KuTRL5A3u*uK*$6*tK`APw
z(_1*HQ=1;q57K9%HNEy=zFCH0f2fb7?+9MA<xv|3hX(O7scpg59ER2+f=@bD+l^A1
zwZ~ab*<f0QA5%A6y<<*$?w5|aE)KEM67}SHhW@v<6H#rLRYcO7MF$Qw#XULV0BvxX
z9`{^$`?lfh=zB?nH*|gTyj;O*Lz`GF24mJ<&Z0XPHtBd#DApaw85WjV`zB|4<I{3a
z=Fif(<1lSmaM%$8c9PrPIOhfT=X^CEew%OJy{jZ7t1K7Xz?lY%@KgEjs)1FCY0zL2
ze9<@9^_;f0FvbB3O;*_@jOOMt%J{Iw`?J~<yn@;9;+bt(#)SM!6K6C0j-?BN8Qibt
zd#I0d_h+|VZ*Q*S_j)_KZ7JUj-Nbh(I<svir?$<qbKBzlel%=Z`1u6>T6@skPV5#%
zfhEzs8H|h9TsvQGF4)hFGk~!$k1vIF1fZ^Y=+rteE?$eleQ|Wf0G39WFQsNnq2*F=
z_t$NNZ=>|96MqT$)BXxZz$$+(FbDe&bGBXDi$>nZB^U3P5|qD;(;BX`k@e*~T@G}A
zSIRiIr}-7&7JHTa5$GQKE8#%eUj;wB51`7eW&OwC);ww}e*aP8`70}GFZe1;Fo!zj
zjJ+T%;?miD{sh0zScYn(Q`}x+PvU&q-H?7r!pth1)73CD_$kf|l>d;Wx&|BQ8oQ*!
z7ern=nX=(`n4!4DQq6`&3N55Ab7(#KI!Z0)dMJGz&^}Eqe-F%;Fy6)XHa8Eb`qz;x
zUYEaP3#FbFrxQ(PL8qC>h)z(HYq4LA0B*9`qXy=KY6#zmWZc9xg{uKM;RFQwo*A^<
z1+<{W%%e;%$sd8?CQEM;{PFHmlaIhwwJsy74hho9i<)yPUgHwnW1QzA=c-A3sTZCV
zT{+%d3&!b7GhhC?5!`hN<2bGXS~h^YznbBGGo_36-t)yM@QZ#fVeMnneX!3$pR}I~
zmE`79W~uja87QeVQ|wov*y}LNxfZW?-lwrpIF8r70uH2oCH(N7|4gE8<@`*z<-eV;
zf@854_4DXbtrzOli>64^T1BEC<&3=`jOWs6nmV<_OSIZ*BN|=jzsMEP!<V5YJb%?P
zV>i)e8RMt(-)rd+<gd156KJ{GZKys~t+LhpRp2h&uek$#PpR{%r%{*LNLCXq=c7E$
zM@D9vwz`Ve^9RIz8V=1z)7*mm{~@PxF67(EFZ1RvxNhLYn!hB)O0LgvcFkSL$k(}U
z<h#mm!Kc++%Q&U+Updut4X5%i=Ud7bIYo1Ou!k=!_i_CvXH6d9B*)`i4|9fRC*R(l
z;snn}I0dtZ^E=OS8s($GyIeov#LO`GDJN@YIE|(v<Qx95Dx4MkESwWw6YSti%ij;$
z`P%YNgA;Z}WngDij^l*NKj+KLm0T4%ZHY5E|B>%5bDYbwhI2T-&DFrUkl*DD&b3_9
zGO`iX&a=>e9A`+~`PX-TZDo)?k7kZ$S#dgrN&l-Hx=&fI^06Lbu=Strv>A`BCobQn
H%)0+S7=18w

literal 0
HcmV?d00001

diff --git a/docs/source/_static/css/Calibre-Medium.otf b/docs/source/_static/css/Calibre-Medium.otf
new file mode 100644
index 0000000000000000000000000000000000000000..f9f11ebe430e3745b7b363078530cd6305f04ebc
GIT binary patch
literal 47860
zcmb5W2V4|a_dh(dyE6;3x+3GEvd%766cq$TMGW=|Dhd|Vh?J#Bm2L;4iCtsF#9pzX
z*botW3s@1wh6Q_#qLSnc-Zj2wW<kyKJipKT`KvH@?mhS1d(UmBe8(aF{zIr*R63=k
zy7%ni*Jr+B*<Ok&$fqc(ORv6t9pcZQjHM{eU5XlexS#J3zbj8)E}*EyB^1@KWk0{+
zJ>Ne4bcmwjrc#uu-Vi?*x1hF@N+^nMhxNyY#014(4lc2yD0L{7kBTq_h5p@k#cC|O
zh(8@8FoT`1+=28ed~X&Jla$seXkal#Mf^rl?4y{Vw0QcMFBZ`)@fRB#6k}2^^5yUy
z+h?xE$0a6xzDOCce@C#~fubc+>W8zP-}?mC>G~I?RKCC*>fAf0^EJ=Ub3QvNbClEZ
zT}dgVT>OLe6!ZSsNEInBV)-0pj?BqYW+B*5QN9#?3}00A6Z%hzp{x}16r=Fn+w!Ex
zQ3e`&rqHlTr9#0flvt*gD_8&W_8Bk;<?KKd)oo3uZaYxaFH$jmL=lD$2ka9?HKIp+
zY2R5Y#2@LmnsTmAD=3z7uTCqehLmS@T19Co|LQbDb)|x<(=6t%txl_{G-_vc+KQ@6
zH>*xtQ?=<Do*GI=_pHvZ^-Wq!HKj*lA8C~}_Gm0UxjIc_pKnyB6_l3#P@Ps%ZYW|4
zy^3n2aH~!;RGMONb(;MqUroJJ{9c`JMKw|8Ri~{fd*!X_w1#q0*6^?OO<GI!RW+)n
zv7xl8;OcZ;s+KB#yMtRhS62t`=*Soc|4H#ChrV&iv7rf*oE>^ZM>`}$hDRhNIwY79
zO$jNcQ0HDj(UHLkra`9A$mAGT=XUMfJf+F=tj=$1DX4yPu)K{hB_u}1#X4BXef=ET
zwUd&ro)K|LA#t%OQh~F3C(oFmC{tWgm@~HEhB<Dooju(DH{Zxa2UBEHgek!x2)h;@
znV4isFoilKB?N_<VuBK)9O9&sZ(hUxk3l*_#ySi(r8<l@1w{wNhC1Mn3)YR3d554N
zk`f|KiO&Be9Pb3|f?r%Jjx2amL{d_G7Z;b*)Kupvl%jJ;oOA5t|C#9$9T{SZO^l2U
zcaD#Ur?%s&cB9%+uK3r1@}{DxNGb+X{?sHYo-$#oFBL~6Q?XPim4JE9n9~EPqVbu4
zwBb|)l|&`VZzg=2FgFE%hazP!DoExRjCm5LL2`a1a*LscVLBXB(MXekZ98Kh+F?&T
zu@64f0HpQ&kNR!@RsY|-|6Mi$c_(0BBC+SO*zUL7eJMZ8k=m-sb;Xt<kTQu1k?B*u
z)NsaK+zCr#u#Zv5Q=$pOo=P$b#=c5@cE+bG)tT}@+JE=S0jVUpnDCi|El7QGKu(g}
zq_In-Nph|vEeA{|U}-4kNbMzHiw@XgO`ZQ;8iq9gFS950Ns`E5Y$X++qwy)tQII^p
zQtzaBcByVx8gEU%CD}-;K<Y;%)=I>Y|NpNRZ<$(>f*;mT#d^}b1milAR!tJFurAn|
z3zkTKopH>T_2(?lg)_D{`Tv2^MXnu!t;Aw0(n!ORXFPI<|Bt!(b|p%uw{))K58b5t
zRFzLE`JJL!iJ<xqjZf+4KmO1PrHWzI;ntj{R&8w^o4U4qJ)K?s1`X}?jRb?yp>dO@
z&6>AxY}v}Gb(^-%F6~_1+IMj8*r~He7tgNWd3EbPrDw0+efsw6KVYD@&!E9We1{J6
z^B+E9<fwqrW5#|zE|A)`W9O~|`NvNGe&%dZ;komdE?m4^Qhep=wHu|^Z{EIJb|-F4
zP;gjc!m<rn%cstKG<N~D;_(x_(xpH17Ez0rP99HDriWByYTDE7*#!%KiJcRX(!**s
zb^m0@g9%a5G4T_Vk|%A=*|K}j-hKOX^A1vnjvPI{WZLu@=@~!Fn*HNXluFqVc@3tz
z<ISq2T2PKuAT^$vPOYalQCU<rl}q=hr_)R6&x(eMPKrXsJ;Q86r9(Xjdk2F<6NeTK
z?Hsx|^mf?UxPId%jr|*sYhv4^f0OY|OhobdGxBvnel3w-5c1oA{I*g#R31GL`F&J0
zKz_d~ZW+=I9~^jx1`drJ9AthTU-<bo9#hRPl+d3)e||`H#~(fZ^!a@J^Ks-a|M};d
z^!Nugg@6BfFzmsA2d)o{4?2|JEKhnc^1;yiOYX0@KlA<!ChC98wHKa;!{`)T26yRa
z`1=|Cg8o38>3<ZILZz_!H}C&?3!`S@_~zl-+(6Bs)=(>O#LuXyxGN@7v#3mJ5$=Lj
z)Nj-}T=(0lEhvd>Y8$nKnu|KY9%>iLWjB>e?WOin`>9{41JofZk2**dQ2EqmY5{ee
zIzk<#j!`G5mDDoocj`2ChAN`YQiarF>Kt{6I!|3d&8~#HOchgCaOSR3H>hh=DRq;&
zPOYYvQg^A_R2g-Lx`*1xed;~!M!ln5sSk8}>Myz@1$1Xhq&=v==}uG?-G%x{ds6?<
z-6%qLr9RPK)MvUoWv0KQX}Tv((LHD--J4d>y=aE+ORMNUG>0>4h3kZ+`_b0)09s8g
zp*6HOU5oaiYtw`1!L*hhLfg<o={mG8U6&q4&8L2*ZD~Kc9zC4qX@6QrkDwdS0kj=G
zlCDpWq83uW(Dw8gx)J?7t*6J*0zHm4(&K3Z9Y`IfZc)#v7jzRkgl<d+(=F*px;Y(2
zH>E>qM>>LTMw{psbU5uqN6~HRSn4G0LdVnX=!w)Rs+_8%J5Ybp?sP+XH0?kKQPZfO
zP&dq=exQEDRX>|<O-IwM=m~TiI)-+prCVkW-ilW8d6|OeWE`G2iSns}OW^Z4NeBM#
zKQyIMP$;fC|5h-htX858C5j5MN-5S?^{J)YUQ0!4k(iG?KiV>l>hpCP)sLuqtdZ|S
zTdEvcpt@O`;wgKoG1VHcgU*x}UZ8$d0B%wf9;wN=z<$6bv<w&94qQTqaG{*T1#lB5
zx}17N{f*P6K;he>a1AJswsd>CJ5JItdK3zD0*WP#{()Xh|3a^zH_%(@T>3D5iY}xt
z(bwqvsBXWZMO^+Wg;r5dp;t6fv{AHIcqqCl1}gj&qZQ*7VTx!)iej2#7OLWziVccv
z#coA`;)LR?;-cc3;<n<B;*sKwqDsjqHOjh5LD^X8sBEkBP<B=JR1Q&&PzEW(l`+am
z${EU^luMOsl^d1YlzWtU%G1j8$`a)b<z3|?<r`(C(yUUeYO8papmI>PQGKTxs2ZyZ
zSH-I)seV!|QLRvIQteb7Qk_*@Qr%O%RuQHaV_;mE9?U>yBr}eQXC^bVm_^KQOct}B
zImKLM%9%GzCG(N_hh<nTt79G57OV@~k?qR%WCyZC*-`8`HjIsBli3;U9QJ2+ExVP?
zVGpoJ*wgHJwuHUIzGOeJ|EO7YU3CMsgSxf4gSxA_mwJ#oKy6Ydsb{Dcs#mEuI>#nQ
zM<>Tdwrkg;T@Tc2Q%pT#f<hAFVtd4e$HkhWdL*E(78H`4Wa`mJe)O{Z4hce<ULlbQ
zA;~de(WbOsp>at;A*kCW^_GizBafh@J~GEXa!#Kw6n!i$^s%(iPp0Yjl~uoxxR{tA
z3$uP-D)kQzO6dQUXn<U6z?b3y7M=qvJO?C2Mu(aP3>^5ig7-IEedICt$ldb!HXqpy
zmdggqWrM#d>k}J3L~e6PL^4VwAvq>GC^>0}rP(2g(Lsq3a?N3u-*Ww7-_-Y$^Zi25
zZitQ!^84B`zi){AaghFURsU+j;c}b9zio54rOn}%p$?ZOXt+Gyk#Z9wzp@?~o)DB`
z8W|di#zA6aV!+pu0J&teTw(MVAqL5f1zGq9S@;KiSpY#MnLNbuTkd7ZSB@cHxP;0@
zq4EMS$sA2`j_C^pnkdo|Gg(>)mubSkvI_r3BH>>uMWAdVz7k1BSc)mq8OtLr>>@4f
zB4t@bPLLM0Tl-EEzE+9)hHq4Og2@ysnLLpp(ej9*<xWO_Llcd>W970~xh(dZGE;1L
zoZN%BZ>J{C(v!Ha%OOFopCH#y_@;iMoS#@d%ZXn*nfMJ+5)L^@u9{R$m@K!M{B4`b
zmNt_uqfeHmD_Isns@z2CSJtWWq^H&_he=;cCdnm}<qDI(NIWbsOm5uSFFGzIC@L{N
zD8v*fiM733N9UmEq<>{tUa)SkDLO6{2N7J|Wr<Q|?;H|@9PlKO5?|Yv-=xz-O2{3T
z5|%+pDS5<F;!D5em+CbwrKKTD&u^t7y_?c-YGXp<Qf1Z?lTC?9Xoc6jhbKoyONMB4
zv?(k}BL9*hoBTC7(WaQVq;E;&j4veeOiQ%EK_OA!yd+3@@uq~xxKQa(l&o`^3Qt>;
z+*-t>_z08SF;i?P9v)JuDW)bBAC!Qrv$|vQ%a;~|aY;p)zI3Vj^<TxNnwnA}_5nQ@
z68j+eH%_3Nt81q|$q8}Nk8^ZVXdIqM37C+2iVw+|5G~0xI%yJiF*+$MIu2!wiOD9p
z0&ZP=NlKL}Cq>{!l4#<RB^F_kDbibFWSaCFcc@g&B=;aXDK=8BXc3EZQc$qG)-8!>
zTmYDpXo`spiK}^$R-BY8_u0L@<<Y_N=xKTMvOIcQ9(|B^^(Ziv6cLx4h+7=H5N%3K
zG)+tnipG5(Bu$5uj*Ux_pEy$qacOcgJ~}xO$zvj8EzjiWq{#T_NthKHnGzW)jZLPj
zW{Hn#hM1D-)>Qn`-dBp6dNr+L$u|w-+rOPg)o;`qNylnQ4Mmmx6e?$Wx-+V>)96L?
z4*DkQKUE3?8Yz8H&nZ*(Q-&xfD1T6vDIcrcRkKw$RQH%xOb6yW)G@{|t5LUj%DiKp
z*&(bSJC>cuPGR@6dF%zYlzq%rqK@FCc2j$({ng*2Zjh;7qu#7OpgyF2q<(2t*UHhV
zt(AvWFROu8gRLf7EwjqAI%8F2b;;_MRk_t4*0ro#TX(SbvhHO)%=&xlSnKK5KUmMV
zUS_?*dcXAr>nGOFtzTLHY5j?_<&2yo=ghfr9k~JAaBdtI!liO^xz*fm?i6>CyUE?*
z%DML%rADLSHF`~Bjibgz<E0s-8KxPl8LtV|L}_9*$(l);8Jby|Ihw_qHJYuOY|U=X
z5lx}ynx<UyT2rYZwUo8AwHnrHR;#TpDr@4-?5wQpT@yD)O_-PvX#i(C%*)Q+f_d?q
zqoSldO#(-1cW@*PiSV#3*!+F5@bO9m@r9|nq8)+57Z}O+?1f)*cb(E}bkNBCHK|ML
zy0<2cNTY|Xp)S;Y_!=4+9PK)V4(in_^jvxQzH`^E?Hkm|ps6T5`S_wKuiF}9+YL<A
zFVlWoXP>isRnDQT)YxT)H|*W1zDJwtoqGFv_AH1zKh^j=nb|XQ<@7}Rh?q&=2ToYK
zC*7dYk)cqV)B`s#Jh=ll8x76b0f|9@9(qC#d<FGDcj+Zi_YMRO++=*uLY<`ccZg9>
z+z8eoLyek3cFTTd#rhQ+x7hEE-xL`UmlzqdA!XZ71`R9=6G*!GBl9PF9=sTs!Ja2x
z%wH@?pCf2Ox5w<=yM6n<eKFfjrkI#egC<<PDRumY06nSSolwM@Sl=KN)IXZHVbeO}
zD^<+ou%sZpSK!&QvV&)D9oUl;v)ZW9xv*`C4fBLOR(Na(J~RiH@7t26KeHv=-$*uB
zzyb9ezsoH~1}4TO8Z^Y(Jc=mAQA~3d+^|=#Sl~o&urhlyuQ6wi;K&jsl2{>$M)wM=
z8DhnH5o<=XYSVAqv+R$=WMNO^Lc=#DW&dc<d`F7ip_p-GL1*@5D%dhOS4PfZ0(7D;
z<HWj?Vq|ms=)pV2o@_+&lnGJEt8xrIS<j<^w@@}uz@QtS9-cBr-&oMVL3pksonR|N
z7#0%D3`6FAfVpZ&6fvn_HT3>Jq3TcNxh=ZF;M@~TRQ}1yMSAE09N2^RgNZi+PZ^J0
z9FjZUkJ&lSf9)VW>4E}mNPOJ0y5;*DcR`E}>`HJN59G{9*<v6+lRHf6o|%VE*h7^a
zvFqKIP{EPQ*Toy)XDDOVgk_E!Wv`)kzk!rD$~_Bp$HEVuVN$a9%{ZvHEXDI_C!_Wo
z4;7Bu9Xy;_9~JuRc)ex4`m7nA9b$wn;Hv8sJg8S<>d(JUHb4gmW|k!{iI24>DR$+#
z=dNvBH#>cmfwU*V%*+ixWo@(9=sHJ?>){`<`ACMriFI0D>I;k>xM!Ea$*2(nAfbt%
z(K(vESaL-4Vp^F$R)~HqTsHeLcUUc4)NR?nXL*jE-&S%stVEx)clnkB#+&N+AX9pT
zo*&g`%<ew=h_LjDAx1Jmm1QCQt>kdHM4Bm+#s!%&ETns6()bXgwo(aaGIY&34UE!V
zKXkaH_|WK{y}uvRug~{~uNt5fHtXId3_e3tdeXQ(F%rLS%l)#38$EKbgx}DE{ms{)
z0=oMvVl!~~gzuA$%hexFzQ2ETdwk#ugH~6w>T7m^f&YW7G1t=(`dVM81MHnU!2W6=
z6g%{%sK7z`p%J@}oygDLwbyv?@QSQOr>t7BDYKFz!VP1`_>bytuZ>)nn_8fM_vRLK
zhVMuts7J<-NyM3SBVlAV_%?%4&+c#7eAIYN9p}?8)me`ROV@J{4&Gn`w$Kk+f}#^?
z><~C$Q1lNgbAHB+b&)+)-j^#-R=Gl<4tz*cXh?eE-=@;4B0hjm=!t(ZAAAgB?T9PU
z5|*^Yzgl3y75_@lmiSl7cQt6*|8TcwDe!lmJkU%Q#J5oBz~RwzmxY_+aORqMxC>lU
zlUwHDOiwXTty$j#OY>4TO<ZNjTD5*to;`S|*Jfs}HN1J+n(a1XgqM+YRHtQRq!|KS
zj<RbrGt#6CuM;C~8eTtb#inIst~F}CKwz4&&n9zt%o1&J`?r$}&QD*e%T^bhS#}F$
z7&tp3;d_1mkfQSP{Y94#<s=0DY<$DcyfyYr*VR^!T|1~-ruFMUwDwy2;EmBkx#>SF
zTcHuXbQiWAE;_VzV&Kn)SM2ljK^L0q3DeV!(8kVQU2e5k2hK{Io@5x-SsZwu>|<B1
z`+0Yc{bOi+mo!#u#g4ixHfzoF#DvMy5++Vrvw5zuGuwYje&7xLje~jTj2a(~)#%De
zDnq)mnuy7!#9+PG_eJ;a9V)t&yEidnwNZO~)7s2mR~as-H5ocEz5qKI0z;g^4ws<O
znb?sbWXKC*2S%f~NA+}nTcRVnkhZSvLSBLnbo*aEGin<PT4<q4p0r}a#x*N8Y*;-l
zDS7g=WTVDE<5`!x@HtfjPM`!Dyo`s*G&i+&!@^Z-H|aO7oSKq2X-0xkGgE-(P)`@J
zH7+M-+t!?%*liIJaj_8w@<bJ}H5T)>Vcu5Ei;KXeehVj|CEI&4N4~3YhZY7+ir^<w
zb2;5W*1r=oC2@MpMElX(C*ozWb>Hs9jZsSs8uyHHs9XA`Tmw&a5WtS;#*k(`U$O(*
zFxS{iuNYWZO2)7!ZZgoKv^9JAHPegj-<l!wSuK6G683Z!G%4Wq6#h=tSPoc?3%ET8
zH^!B9A#Tr!8>1P*Rchd<j?^KwPx(P@T&h~4J#`wFu9U9re+n0~mSEc7pV8`1Z9Q=t
zw*+`Mg~p_F9&StZABoqTh{2v1dO0~IN*T~OuM!%Ax4rg!5wmmG_U(E0=K^zk^&L0H
zXO^Ly)oj}~GVcfOr_6QpGWD;I9Di)oK<a$xNE+ihFU)u$ZcWvQA9WhtCZQ7i*z?z5
z99hMH9ZNFdF)V;=hSX=tFQkCcio12B0qNWoI+F%v&;UA@k<QRSO@1<m>BQD$=GLrM
zY@;J3V9T^u!}RuKx*O9$4JA;QAucLX(iKWrEeTUyIkmeeH*rgJrctv#g44ut6|82%
zudB9hw=W!@+f(D6@euMKY8KB*U7V~RADT2_GIn{54qSbRs~5Pkd_JlBLCvS3cH(-A
zxSq$~8tAxJm$2?&(h)t_-2@5`7n^QC5xdcWNeQV&&0%5V@)dgv@j!3-f*cmtG-a{I
z4m{Xlym@V<xE97gH!s7!F?jl%eZ;gCG-Jpx5NPlKFC{D|!~6uO&5}{D1crh?L+Y^_
z5(7OnbdG#EYHUweK(7k^h{n<ETcOFg5tsu0H}2d3{zgWs?RM#TSo)<A7wbq@($y2X
z63;t;k2|C*bX9Bn4IkdW-^t;{M*eX3%O_738wc1$CniK2Gy?=}J5CF~>ckFr%pF)w
z87ne#1l*wiOM%l8C1DtU$hbo4sWo)oYjAt5#LMfG0R=36WHi~G@MwGB{kx~9w{OQz
z7kuyBx$t|rK?}WM@iVd*IzqMvG>$~mr*nyB#QJ4O92Bzd;KBS=`Nrpi*w88ANn`XH
z@YRX!ubbPm*HvQsoA}h~@`87b92p!retd4^@#DF>@(daS8A!%4{lvD<$T~GH6-QJ^
za<<~`+!h-V5gUg)bZZW(B3c@J_Xu!NW19_)K7U@2PQB9f5NHip<Igem!DYKP?A9OI
zB<WOH6`JQ_b1>Mhd%p!5k=k#L0i}T}^k%3~YJR#9ngYhlnmc{3m)+=FrkVdj?D1Sf
zBU7d3N3H;@CkvX4OA#sHc?nwX;BZoK>Ki^Ksh<)c^(kq{YL~7*C}_e-s}$()1oml~
znm=MEY&G<`3T3pQ>BebW&A$&j?rV;$<}_QUZH@a)Bl_wbHNS(Pk&b9@)TB!0zqs{u
z*)!<)yvtq9asgNqrvz4GyYqzxY_K22{Xp3fyIFLtSfgvn!h%l;%v)AFM$D8}S_kN_
z)U2Hzy(U=i;MI~SHQ=35?r|4vrPGKmu7Cm@X9_HQ1inu-u^ErWagPf$r2j+sR1QNP
z(i-CDU6nvDk))A6W#*4F^zxm-;-P~_52MAvfr*N6rdF`j1~s*#i=rc?7)dEcPzryV
zo<m_wggzp{Lg+^n{hU(RQi{5if~ORSjQEM7=ep5Ho6(Pz^o#lmx{jirgQ8O-#V8GZ
zOtRmU=p}DP^+xaaC3ODpMjv(|eO+O#uv0iF+|gU@tq4Fbb*4g85T%cDJbIf&RV|ef
zJ;|L^eN@9#+f|QM@0k&3#mz)7aSn5WxyamLUN9nhhdZ*p*#YPWzRKQZpR#|kAJvW2
zZPCXYr=F%>gg)M#>f7qK>d#gUt-`Hh(Q;d5wcF~P)nlut)*M=G9@dks7onY&i&ok{
zTpg|z*Nuzhrg8hZ<7khSaj!U0<ErVc@z<C%Q#D&OduxrTHKEp=S}SU8s<o@u)moLc
z)wNw~ht!U*y}I_U+WEEbYI$u_Z8z-{?N8bz+TXP6w5PSD+Nat#+P~{ibrf}M>$ueE
zSjW3gRGlkzKHD%hgKU1Z*<`cL=7vpq-NAM9>lWE!{L<FfHqdst?G4)(d=tJ4{~Ldy
zo=3gD^$P1<*BNykbw0XaU79Xm_tuWGv$bns*TZhOU9jCmyM=bE>@M2fwtHTmu5VMn
zdwu`<LG@$n&#%9-{?7V^_3zey+<<N1)}Tv+hz4659BxqB;B`amhV>gdHS}({py6)~
zuQV)g_@}*%y}f;J`|s`J?5EqWx6idNvM;l*(2v!J>y!2A`UU#M`qTP5jg*a;M#e@i
zje0inZZyBqqDEPbjx?$i9EASD2q98P6&4AHg`>i4;kAJ_G&J}c5)AVVxrS@TW=0RA
zpK-h~#<<XU$oR-;#xSLmLl1{>4wD_`JFIay)L7Zr*tlKe{*6aAj%+-q@%qNQ8Xs$X
zq4BFG4Vt(#@o6%)NotehO|Ca>+0?7)$fj{k=Qds4bXU`pO>Z_+G;7r?vsp>As^+zu
zJ2Y?Kyl?ZQ=6jkKH7{>2wy57?af|#Gw;gH6W{!T2$&Q;H^BnKCbZGf~%Ox$(wtUzU
zTGejl(8{aT_pPS1+TQA=lftQvQyZtQPQ#qSoPKfI<#fU6No$+d9a;yrUfFtQ>*KB8
zwy|q7x=nJMMQwJp`Mu4{HvhEMwQbpUVcT_WH?-Z`_C(u~whx>c=X%bL&MwZKoqIYD
za}IY-be`cn-+8U`9_Kvgqt3<7cb)%mp<Q$?PA;8Y`nv?UOmIna`O&4QU0V!93~LwK
zZc)3H?Y6bM-0qF5udAQyX4ibzn{K{tp>7FoGu)QDZE@S)Ufn*U{q+tlIxOpOy~Ahs
z(eB~yN$xY<m$`3r-{oH5e$Ktr{ek;i_o_lIxuYfTW`RLWO<gAN59SK|m7P#uSHf-<
zJs468@3fyp4rbB&ADnui+%Iyv2QaH63l3zKC8a&FJ_S171j9|}m`bk`&nm?nn5k>V
z?gRt#2fOMf(}vv#9hnYnDCsC&e9}vM`K1XvwHx!Eoj?psYj#RE<|S)Fl1DNH)RRdb
zvwM*o?e2{*L5Fz~E%ML~b%B9mf9OHyzP$eQ@=GO{IH6Zq5<f7|^a@P|*B)kL>c
z5H}G~cWZEwm)D@dUS8)0-MUqD?v{bv2Fx9F&dY1?ATO_?!MARmE4pQXMMx`YgAuXP
zJ%ifeR!Ivyn{jVcig=*RcK^%B*U;+il;?b|=qtwQ?)nxJ+Q9F0>^7w3=+)ud!i{|H
zpb3FPdfOA_{jN~^@|n%K2Mzp$lYX)`WCIPipICK%;rWIx?A+klVKXKWXYgnclO4Bx
zzy0Ao+wu&2?xBs5;l>tbM_vE%2QM7jvVY%dBe)Vz27QWCzSnCBRH;(e2x~=MSAk64
z&TW6I+zz|Pb5a_HzEwglz6oTkI6M_5rqa;jv=aRUb9C)TZ5{LY7}Napd-dZ{TMxS(
zdvYx9aQjird%x!D_Q&GJFxpk{>m3??yrYC4rwioW+LSfIsuV#>QX%FUy&q;P_e*y8
z>}O;)E(mG4tP$p7vEO!1x-gb>+ROd0nxpr=gB9<Uu+c{#ahU>idZ&cdVZvaR_&767
z)X=PyF;9D^)_yntnK<Y<z5Q?SM~ZvmP~C_TBYpb(ES$f3?p!?AIpYcVJ%T<@ZQ=GG
z?r-=PFc>!K_OTZah4~H(G!5z<l6P*Fu?0)iOCNOwTRreM?^d3PJ?)ca<lh}xzvILO
z`-<Mz(8r_p>gq7v@SZ)nIOo7c{qH%UqlX4Y`x?VpZS-iacg9mle*)Ie=?f2F+#gE#
z39jfa=Z@~%H+b~GzF`N>{U9~LW!~=wT0LmWAEI_$r)xP;j}4tXHO&aEV8wItmb@)f
zXRY70Az!Z@pYaH0mxJ(t-uoQpzEVPS524li6mjiydX3mpDRvRFb!Fz3E~u7}F!Nfb
zlGXa5u`uot^u2GptD?(W@O|5*g8u;XuIs$V>^*<s@SY174u^UBj4^o|`9I)*Dj+cX
z^x1>iM~@zi8tNAq6=38)n5|X7y}Sg<dm~N`S&3xF*Bv-=#U3cv!e-4}hY+gofziiP
z4g7~8Q%4N<z=ip80B{EE>jMzVVfG_B=M~I-t^{*uPM%g&;pC}>6DmiXUt9=qSjt&7
zTxkAGY)4-#RbEWtwuA5W>o@6pVteJi#g+mnx~>#$Q#l9eC~5xOb{$Y#QOjc99C*)P
zfp(oZ9Kjl~o(||<SI~B0Jv|9K!vGX}c5(K>bNaJ6;iE>3i;6TE1IJ98ILF`W4ZD8n
zhMe7og2Shd-LTg}3UR&)R>l49!st7HU$fnMr{cND+~Ui{%c8BW13NpO$rX(J@lGcO
z-rId({mwN8zT8vbk5A+JaCj(PQMM395k6g5H2*1}%G33!ZSgxusPK5ti=qpZ=<-+P
z9KK`^3ePj7o`FP}KcdHv{}bxnDFU^T7cT~#8M@X$e%z;nuE40T?oAjv)D+*_NEmk=
zG(gIdy^((Yriea9(ty=&6P!71;c9^{u7soLis}ABC-!)&2Hh33dr1Ahor%s!0-bgJ
zI9BV;(Z%nT#p1^{oOT4Bqzj+Hz~`A9bhm?ru;Cfm00SZSxw(<~qgp$Ylgxz(n5Gmb
zLz*M!EBuKyHav%)Q)4+TDZ<6}F}1&-^~rc8X5sd~U8!6N{d5C3?Jl_p`-7bkKZXnA
zQNLZ2n#mR0C89w+GE*piE4Haniq3Z6A$DWT-73V7#7a#DnWM=NF`6M7^T)ShHx|A#
zcVo2oWNvwa_`CSAqwpiwP0-H4j!b$AK~JSI&-$Q*voJ{q{Qb*lWAU9Y5#C5L%pWUZ
z5<7!?Jb^<m%Isj>)sv&nGbc`G-+gJ1ZW<2VG+k<(f5aQCi9K{FIJkV>Uc*t?r|UOu
zWI~u>bWyJn$9$L{Qj&g}s3%mPm*_)~y@NnqZA<Ko+6}VQe`w8VEh?@Pr`=@PgfP8Q
zxm{eFESQ7<L3=7wSSJ=Lq0$bf&Jh~3q^CKN^b`{rLbEUxU98$r!FF}kBt9N0!UTR#
zYeBm?Q^*vLD#fGd8dkHUHHjfDAqHi`LI^pDZ<iRbVxc`uf%aqy0~(gZqL)>>9go6{
z4@y@-y8zcjz8I(!7rf|>yK7pfr{Xb`7+zl;#WU-Ln&Y&x%CJwUDTTIJ-1g`79B~!f
zK(OZ>+e}f-uZQN?3q3j^-?RMqax|Fv_2tAC0~jc^U`9(kiS7}-4qhtP5uFZRJ|Zvi
z;09bDXzw&qLQnDI2tn&5>=wZL6+cyMY1f3~k9^1RQxnwukpON12cuGL&s9Ji8m0fT
zjVsZu&UgE>#xjOEj%t1?8i;E6U6r$X@1|V+twV!7jQkOg?+1GYn>OW4HfXnU*_<z@
zwJ_ZwGo1<_>?*1Zj${s-!8O74!5`Vr87fo*g^|Kat{qBe3NEQn_rdNV{pyVp3ac7<
zVK~Hg$9ssW;P1kR?HvUE53x12ez&C{Z=KK2u+Q`=PI=C*0`gfW0Twu_wSL?$+zZg!
z9$m=sNl60B=b9&~V=&!G&F6*)eA=(#KTSDp|0n@cQYC8*JC-J(yKo>QHjpe|a8&0W
zV6^K~wsTTx$11~5xMzCeqN472K9ywDM*3H;5r0Iq7md1INr9UK^Z_AhaVbXr?w+Xv
z3iMauR{=fx??zl`HzYVIJlddLC~V>Q{F<Kfw+>)eV})!E4<)6$pxr2xb$wDT|GN`$
zq<bCFag1Y~&RL2qQbairFI@$TlK3b|!lcgNFaTjgY;F>21!t7xxq(AIcm|&rhw1n|
zaj4%NlKQk~4423KDC|IQnEP+nMpo*ZqRb=l>dm=NKd4YXfLglV989jb@3=rvYljHc
z9*TUim+0G!<I{pF)Y@M2LDMRNvt1+l;66EE2j5}%pef7mLAU8`qK4ay0}EX-#Gs9q
zh|GMvXpL5HJEU+Hw4I{_`d4v~QXGVN4y+?_Z~_N8iqMus6AzNc{LP{>&lMEJl>vq&
zU0^Ds?L}Jk;P6KEapkmI1v(GRO40FtcRc^bcbd<=#ibGeV|xnZ%66{!wX(RA?sZiH
z`jzBTaKvKAoSq!fK)}Pq2e$j~z|cGV9ykXtb)nPJk_>#jDY|HgexPZDtMLxI@3*y^
z4E&xwI|dc$FYG)1+^8kb#QI=BMfpUwIJOnXuiwNuxuN?b54^Iu%TJX%dJBGN=i@as
z@*(~AYb6Bs70823fz%SmD&a|((1A5Gc)C1dwTp8(NczC1U5*!sN{u0`%_uF>oU9@(
zV3ia(LCtS7LsZyzn?u4C*s8qJh3o8iM=ZKyyY5-oYpDA=>>2+=to>AXaPy`^hwZP9
zJTP>RfiERg2@|m)dF3|yo!P5)AJ2-5_}TE9eVC38e@gGvf22q9lC))0jr@yo6O+Cl
zYv22L(AhA&iTf})$s4A|+E0j@5<YU`)|}af)~x&TqG3Sk|G0eoo{@jD;m0*IG4v4#
z&brvOksEf}Z(P1<`%(iKU6^fYyOJl^C#5IG%ra=rbu1Rc6WLyP0<+w33_V2+)CWpF
zfAYT^L9hrYJcvl^3o8T&xu6>>z%O=VG2`Ne-#7@ru!ysyG#3_gv*ZJz6tn{$+P-=N
zJ#j;qiHV}`PpDhW!8GwH@v0`mQv+QzNHiGF3NcaIvKXc4E}bDgA+e0!FB(y!P1K3T
zGP4mo7cEY{)(bk@g6=v$e!DnX_m+fRhp@LSzy4ZFH9!7&4?J>RTB@};c=2)>I$~Pd
zg8n+cCr6y8tLW<3zryqSop+a8l)Yp5E5$8vciq7nS3Gf3lXoye(x9?o)m$!BI)EYd
z?$x_Wv5u?mZk2x*mbCfgZ%H*Hs!@{n7E~iY;8Gg}8rY96s9YsRY*x+_2&_l7d@Z3+
z`R_R&>JvqZ<n=;f^6{7}$2}N+Pg?<078)@F>d#O8N+aqlH2;nX;sUpX1*@ycQ?V1L
zg6nQKmeW2*g--H=mCOH>a|MXEbF}=@ie0A(J0-s)-<PtOUGL1p@V==rscK9tgZLIr
zt8h-#*@bhT7J;}_&^k2!4UZz&D~|0V>=)pLn5pX}B^HXA8Nxmej<|5>*;;xp=AJF6
z3ZY#gZqvI}y4f5b9~XN((1TmEUeE6d7RHF{%V-cFSqXx;-e0QpLkW#O_{vfxEE0ck
z=f;RDei4GDg5^1ac%>_5OxweWx7u>rRIDyL&(yF{2_uIi5xQ+h($E*|l`vz8E_Pg8
z#P~kK#uaPP`K#T>iS`}Pn|e%MLsd)VHB_~<?9)<KyGyVYz2Eb>IUN6I7RTq#7oMy6
zKg@Nz=tvgnfa_o(Tt8b}xb98wtG1ctZyxZaupO@Jz*k*5HqX<*Kj||j$Zw|c9?Rcd
zy<}zP2K|l|sY%8Z^)EAGmrc|U3@{BFbTqOsd;9XtOyd;wJ=N4%DKnGxiHVt;Hf~zJ
zX~X)AX-kdT$+BKYzeG(3LUb3h7&1+ESVMm`v_8kYQj^v`49ssz_Nu+(8SUE%oEzR{
zS@T~&Z^`XQ=TyS7_sS|X+RQ^(G5`aywWOd7>SDEEfKkNgO!j<%HG7+TU}&cAecaFu
zuG|Nm@vONA?xJaM`6hI@K`(m>)=y9c6esAO4Y|<1!(i`rE=Ast9v5DCZg8_3n=-)^
zma?<p(E43__O6fo-k?1osUZWO!uYrRbO<e7B=E<2as2cGjz1Q~MG0AW9OUA4J31Aj
zQ|UW+2luZs=p4xwHXxR9S0CNRkh}_btUk4qd8qcEz-Wid{{Qwkvgw$rQ~LU$bg#Zf
z{z~y|hE4`B7+u(L+yMXhaVA54nE#dm)j?njXyMYSto%L3aeGfsysRgo7=imQ@B)ND
z%m0KwXk&hS;*$bb9zu@?w&&h}@5_K!e1WJIX&pax|LPrwF4@2Imh4i_%ahbLm;>L4
z<JA|o9lLTWCp=(iMC?#Qi2Ht)FL-n`uxGE}@x9_EF8O(?(M>%wYvbJQdQjgl0h`AG
zmj-OWv*EIQkkEIPFiiZolsf|ESMyUrd&c(7WkAG2oAdnCb>d0gZI(Ym0xm<q?K31%
zEp{eLbtl1zAx`IEX&3cS(ta4UXFSN#^WemuBu<RhlLPnpSJ0QNge|GIU~?Bj$~_>X
z^IiUf_+l1!So}DUgU^8nagom!_>V!tx&<rOZqV}|)~%SHirRN+8T7{OIIm0z^CASv
zDySn_1+Fm4RyGV!b2|%lsx1S_s+Elc13J+8yMIXmq&n&u95lw$tzR*Eq~soyT;=n>
zHV&tCo-Ci%goHDLQ8nZ9eXe3C=lo$z6_5G7x38~SyXrFV7w=pIo`yK!mDh&}WCVP~
zs8R`?k1{M#f?Aw`(S)tLcA<v{V||g4QZxZZ*j@Miy=PBf-yS`F_bn~`{rBqzRK%W0
zTBWQBD<=x5?tUt&P*3NeuiQsyT6}X22OF>Qd&gJRp2Y?96ZjqDw+jY-$NCujy>tCG
z4iyQ0-}<2fpD;d&6$3~!9Ke_E?{JX%Kfys7zz&C+5s=>?Tb_kp*K9XbPrzpAD=tUD
z7op%wg9JXeTJ)POq9=`E`3|VHzu>^B9rT8s*N+w*KEG2b))NQm{_?ohvZaSt<Hq;A
z-oCwe>o3CuJGTKo9v%bEKY4QL{JndZeB2DAb(C(u4|9qgSIlhsWyQMf_6OoOjW@Uv
z|D)&GqVu}xGczU|A_{WY)yp!b87=;z>tYAx^_hZJinr}QIsD<94<7gMzCqPAp@QEE
z4&qeZ>uiQE%ni*B|259)Czjv3dQ#lVV2nAtw?X5@{$U##AoL1D@wG>Ts_SE8(fZ`y
z-C|PKB(F|d)sWa>mF!)KyW@ApTk-n`rZA9B?&>;+4nZVQ;r;S`g@_~?(#fD53ANxa
z`YQaTgc0KeXqR#5`V)wMQgVpSDR}$`z;}Xq)IuGxzUcw<>(gh-Rv22b(HTioBJ}<f
z3eKP3S#T<6;}nc0zG2Iz`JZt@^iVesVr|s+;chKQGqKuxFhBrabZH|<*}FMQ_D?XN
zl|UL3pLC%QcN1ea-7sb&`x*ua0b)c~bU*wD3xnvg1!+$mDL+C(<gxN&NS+6_y5qt7
zFa#Dla%65OhQRh6H?$=`%Pl7964NFnC8e#|v}x^{jT_faN;F^(u01q8w1rNE(4~-n
z+d|;~-U;5i=y~bWQuX}X@c6@HJh&BjLH@ma%b3G~0%=Chrwd1<kYwIVao9^|4t_Jw
z_7l9hGs^|^KfvYw9BDZWeGr(lo2&6ikk-=hFz@tcq?%yj_`gA@igpnA7bKCe7IF)N
z+g0tmad@6!9x3V}+yd+VLPr0Z7n}aV?5f?!N-Dd#S(pmek(Cp`nt$;<T4aBcAyv^`
zxi9q=RJGqC-0LC8V=S&%EDdsT1^-HHlF0GbzYgw|WpMoMO5)v7SSx_JKL;<&lmn-g
zVs<v&1kzBPimm?09a=8r{F!Is^8QqUw=@D+WA;G0wSuUYdJgxhs~wQ!n)7Fl)aLF#
z`PX7|Y_t4po3AZxLSyrEte!9I=JvjY`FJNqx^XQ(F^xIxIu73U=T3k64)Hfy6H3A+
zKsZ!&2)kjDhJLnksfqi!5|6&`WJ!KV`;;P4Vznx%C;66&+fjzNoMgWN(VoFohdEds
zoKbWiq9oC|3Mjebz;@-(uYj{Ac@|)y8wc+{4MX=rdu{@J%n-bU<KSR1^N+zU7ra-u
znHL4()r!XcbTsyl1#n(MAz1Gg5=@-WEB-L`PP~FMJT)C!nXe$>k#xx|kS_TJ(gV2m
zTnc<dP0dTl!4<q)Sb-~e@+*EucZ&<*cy}b8)*XqDcW=q@N7Gk}-=Uu6C7eb@ZnqFq
z2yI`v6!IzQ7V<oacZ$S2FCEDXx-SwZqlR}N{i}*s&^=gik{9kH$a^ID5n!0Pogcah
zve8ihyT#0Q95gX&su5{}umA^)B?nx%v1Zjb*@c^h9dA-i+}O9cjA?0*rRKi6F#`Wm
zsw8o1C!B<)IJhW0^aa@$&{|?&(Kij%kbB@DXSr3`BX+MFabWJT`)}^=hL2u@B!Z`#
zg(+YiT4BP;c{fG8^55u6{s}nzH@=d8v6s}77+|AWf|#l5B`lQXg?9UBN%IY;;CH5f
zaTQ#r&{qKKAOHom;~PO8pn`>o9PE+>vVt6JBA`P7MPczNK&}gSmF$r-B|%gqnm9bA
z|M**;><{2iK495I&RrM`2T=sEmLBY=h?ja0U%}sY7x-5b#7@}i^{=z@Dp<(lu#q1H
z5Ib;Widp;Bn=nKg&=djo$vv9hH;5Y|2@!J+xo`&uNv|lf(-naxZe#_Y2P(NMhrol>
zLkGho$-z*~Lh?C;al~G>5o8B|G(WNr0Gz72We9TK)8)bk6X(h$R!#28Z57Vpe97Jc
zt^)^k%%12Xn0J7?U7_3xPjpAwc^UI(&d-=XV}5#it>yt<=UBq;eB2e3`oBsGL9xl$
zh8;c46yvA&9+zK|VuY6Ls#^@{yP1Q*>`F`xWlPr|D!8aWy*Fm`3M1k%6^T6y8|#V6
z)r07wje}Y3_Cp*x$ob#ybrR5DUV5|C;x8X1v?(p4i=jkW9Ld!rMO~yI1>%;&mg1X5
z(W&=<?pJIJJ~w*b;LigmDHW1Vu~SkhaxEIgMN}xxC$ZX1Q#dKaAbB@o8i~<07s%Wy
z&t!o_P7-!W-W=Np6(hlhfA?>X&KdI!9si=aAp6H82T{D_8{1>?jm4vHY)^=w9Wx1W
z1roj@KxT{<)C%ZFV>qTBYWGz>x~Y=A)a4hU2|CcETNeV$UY9+PZe3_D*#K|9LpNqW
z-JO&9_wEhP_e7YRnRyCo7Oa8S#Z5_#FbsHoiOpG+8fVm+^YK<IEv2DfDbS^ODF&Qy
z5$2Kp(#3iURmu@);Z<L(@uCabc^AmA5@<YHpv7>>So@!V-5fTVvjBG_;I142=)>*H
zzmUK7u`NX4(ZxT<yYusb*GmLvk~l!%pQF~2m(TH^%UkPyMZdXlX0srfb$Mvk{hh_}
zd9qpe_KR5uFJ+Bjn}84zyj1Us6rR-hwwuern`V<6N1B+YHj}f!1_>U@_t7@I@wJjK
zU%f{V>kQ_|A7&3E!jOS<`C5Vo>2BQiPs9H2z=0>{&7CN<Eq<RM$qP@k%~dWg0xy#B
zBfh!DVX7O~hYXN&PGFnG@Au<$KEfu~*4*D^$Rh)iG1}2#BZmqOX@t~~l5lI)n1#jf
z!2xAljAr3SIHE?G{<dcKY2&UJc>VN-P0z?Cj2j()&SHcjuM}_3lcf)EEt*RCPjFr8
z6?mAv9XTWakFPM3lVxLxV$hOKu*Ez?3T0fw<@y|2Jb}oU$N{XtCJv_YW>_OHJ?JOu
zBn}@X4k$c{q!!pjNhD&u!Q6ZEoa$oGOQDE|Ks|H^Kim72aITWSyIu6uoiKZ1`~l99
zNtL4K33Qo09DyFwKbxYJ-h`a{k{qE%zuO2M8Y&dhQflp&7z^Iyu^fq%RU%1ODJ`{~
zRl2ZpQE9anGg2C=tl^-Ks=tsqxLW%u>o1Vb8>JU{#rEKA3&5T*mHdk#9LvAk$u8me
z7YUY{vdWWGxTA7MAzv!#M$dMj{&DxK{_#xGKYrjq$avKeh0-rova9~8L@-+vhI;t?
z0UT*9_S2ygHVxw~>gre!>Z@O)L2yzuqc*Y&4-WK36<6-X6%i*(*{$l=n%EF3a2SQ^
z(Vv_3sl5(^L>MuGzrhppo{N5=92su*ld~rY2c=LFvdKp_DV76hEMlmu>$JtN={%_z
z8txIKa1(qYg?L<`dWTKq2~dSP+v#B)C4CPCj2Ro;II%t%fH~D2tat!6=#hq%mX)x)
z>Q^iy!(NR@fng|28QH{7g-zr;9jYi>ax(TI)cayFI2*+>#9gcZg}W{=cVybli=8k)
z9~QfOb*vF}!@N@5ipQi~bPV_GRe8bgA!bzXo*=r>{{?Ps)FcB$Y7~cRWf>A*g5*m`
z8oZGc9n3?J5FIH<<t7k8J^(fdl3HB}7={8zc#KB^=Eyr6bAE@%y65amuwmTTQA0)*
z;zPSs0s~$=D8;Vwz~)+O?zs6Vp5&-sg0&NeZuPmyBUaXeB;1x0_a`y?30P`_rTwZH
zoX%L>KT;XJ_w7lrxq|W<YnhA$xPw|L2EicfF}Pc93XFpxa*?<bMSJwFXH`NLC>|39
zxL4yCEg>`PvV5(~!eh*eWJ!@TT=QouS4y&4QCT0($VbkHPaY~%U@cHkoYVkeQq^~(
zI8=Shka#;JZ$nalJX`(*)u#oLu3IQ=1cihgl~At=u-|V1tgmM$Oh^qg^%1tM+gp0f
zSbA*tIZ)V(Y*X@4O}GGm7nguj7`}#&P0ftjUs9-LGwN*R5j@I|bQWGz%|vC>WHB7h
zR?d@@_<5CRIMn~2h68E(>9CH!4UTUyB5eFWn&DqzL!rX0&4LsSOcVR}=ayk0B_9KV
zm<u*?kT6XS5@Oy$`E>Y}T+->kR**FKB_`CTVTq*u`w|w{EV7c+eU!FE-REQ`|FQs-
zXSN8EmcLy15<euKXkKBSbgaWP+<aAoakYJ_gQ~xmkXy1_NJAIsm{+K}S4a20(c@9q
zK1_MR3wPB0OUiZ&b09hN6+g{FcvMp1Q6)brY2;+E$lzt$UYar0-H|CU7E$nMPP7I4
zCa?K>Pz0PVE<JQ|h<@DIwfP4N*5>cuH#u&(kw3PXxag8N{+@jQ8sMP%-GFmM)vnL5
zZqp4rxp4JwlLA+S=v#I7?bPnZ@MlL4u33rrg`){-?EPB#hTbi##A%)Mia&k=?uZ3T
zgi&_Ca{T`8D_-LTo#+4&pX|{*+3$ee#s7jxji~_c0P68yoQcQ99+K+G&maxVKIm0E
zj$XwX=v6!(#bHDegKbL4kqAgfa<BzPlO#nJGesUX+2v%<H(8P~LH?}HIuF`x?kL!x
zC9(xXJj)hv+rU$Ddtk71s(3?8#q-h0oT`)Jn=HSR%)zJ}hB)_I;&BVO!zFz%eqf=`
zL5KG|-MiM7C@7Nfv8D=fULjgUB}3GF&TVvY<$c#e{SHkPDK?s0V!Pu_2F`5wTfV9t
z+Wm{6t~*z4^N962aEJb7^I%_*{sh=;dCR{-@_#NtDE4?H&N}8Vg5<=vxGft<j&npZ
zn<Q)K;~>FZ-k%mAJCZ$xM#*pa$uLV~|I_Emo429LkDs_lhuQ~ko-mY~`or=S*_#g*
z95U|AUcY;3o|OwbK0PdDtbQbi98T|M!LHvE{6y@V56TR@d0FOy1<Q=u5KbE2`%E0(
zcnsX6S4Vl%En(jDmI7Z+dB8w-3<Tp;RmXzw!(|t4s5u_v!Ea^4`xp-n;m$)pB)AIo
z)>Pm!y@}c?Up|%N-;|i&br58_d)0I(q4%X=v#x?Khx@@uvd*0U4T*%x`20+sGYr|@
zm8PrEJkJtgu6T#qLkY%v?n?9Y$9F^2Un9{)ySe{}MZ<LPz>_*kxLRU6|LV+&nOCJF
zQT#!7Isfn@V{g_pGd?P6=9ayN-fV~bF_#Vei=(3?yRf1xOCYVp^*R_`Mn<EfT~d4_
z<f!%@)Ff(SAnZdNsuJQhNtO7JfPR5F95(Y1V?_0!zHP?Y6&1gp<)1;%DjoWWW=@@~
z=kF#b9x>_1P8{)_@hrPz!7nS<==o<GH;&$`FUZ1A2KdrGBp{7zPWnIaO@UFC#xR5q
zDr{_WdbEHy?Hmpwh&_JVG?B%4`s8#p8Vb@eqCSPPgw%iG(Atwsi!}J3HnR>lN4~}2
zNW;4MXmIq$E8jO21MygG1sW|P48lu{H#ZmX%OY1@nxx<US7aVT_y3OD!@#M+m#BS}
zA@%BVL5kdWs2bWvs2GLOdkiL|f~Q<5Pui^k;EB0YI61=qyL@dHRUO7G;)9dF8daFp
zk_)&P4_P_<5B_i)chN#TY;Z{&fy44C(#)Kxvn&h#t`yE+C|Tc9x#)lg$}IDuf0d4q
zul!|Fa6TJs)>rUl5GalHJ{oY|I97=hU*?2Hx5R5i^+YV=kYNrypP)mL$D-77LIk+?
z&8LEV$^j|59}D^FZ$2nQPk^+uDyHGinuZuNwB`RGO2~5O&|?reoI~^9;xCYng^GEQ
zJhuX2cXraTfJ^kIxi0z!EK!wgp}hdD;a~>h3bF-hcJm=wK4ot~&;H+Y8|0lSXP0&0
z_*=J!e9w{4DI9!!=>FRFFxU`91z&nb8Yx<z5H42e_)-KVJdr^Or4lG1TR?BZFS1DW
zX5CkR0@26{&^<VD@87tEB68L5iyLAkp47L$bwdyhYDQ8e+q~B+!Dj7i98st0RYiKj
z-s%SUKP3P}2m&B#mEFHjKdZnH+8A9?SGG4-{HUzMeHjSRc`yd@EFg&^c!+zVZOz~M
z#;{iNFAy!k--1+V5XzzZ0(bi9ib#}ta0P#;8ak1To(<xI+n&$6aTGUxPu4UgK0JI1
zZv37spWNZ-m>Y)w@O<DDVNONa6rtJ`azqYw`pf6^vmpq<!0$(#@GPKl9aT3|5QHAP
zZ7g2FWtJ-#Ib1}a9g@z)dv#_7ssI>v#V{!DWoSYgq0?n*y2b1APzt-At{5x%euhil
zr)r+jQaOKsYAgq0cgo%;y?-1YBIt@L#B(H5ipol!sAvvdG&1HRO{Ar{{G+dg`^ZP!
zN3?t{NN|ue(aME$9`v=~Yn^JG2-0^))L<{<hrS~4lT{}_$x*bG)RXERL58yTM{>_u
zJVLHjXXKA-9hh?VblK^y_^^<CbsD)>bwi$85mjM9kU*ByMTj%^L@&`+0qr*$x}$9_
z0ZSY(bSz<<5sPH=&{KvU41R;Cg2z>jbiYhWTO6zJ>@%{Bu`d~nH^6B%VlnowTJuxJ
zO2cFIH^n|)oo|s(Z{QOQmLGrIcfwoa+&ARtU6BqY^scB7HY*)VQh2(mJ$ixL!f++L
zmAvBLye~%e4_{GhRKQLp9FhI&-(-w;lggbXfA-&Um|zb)L^U}6t1Y?5VoPEj^siVV
z=xC;nsd7Otj8^u-z(U!k#jIZhG-t<Dx#b9wtA0$Cu!pNkLuK32Atmg<&`VYOQHUay
z4_65y^B`M1f)JT2n>dGbjP<|6SpVTJ0=WpyBxnaTldv6fS_91%quC10Bn${N`w9ZF
zWbKr)EMSBJVTt!%u0W`)azljN73qT&3FPBl1M+!?SFDV+I6apmqr_pUHSmd32%kuW
z%o;3nW(}6PC#<SLgRH7SgJ5`H%9O+Vt8~@jeKJ{UOa@$^mH^k%TQ%hRG(xVYd4B`t
zyM<7`>I@5dFALFo<GpLpN%4qIlHM$6q*FaPNE!46+ZvV%oxj3cZ>A%@H5T!$d~rHL
zT=|3PxxyESs}VCj1<aeq#U0>EE0kI2*er-;(WSW{-7v<h5M7!(%Deuhe^-JY&5cGU
zcJz#hn34LS!TBYrMli^J%r5BB?8&Y`k7ggXbX{&iq5gPI%;;Z@f1pP*sZU`Gy@NYC
zG-HfIS#4Ym=Kl9;%zbJ{F0LB<L|>>t^wWjjLfjYh6L1KBn$TB>L!476;+&2l&Pk0h
z1Pjhdk2t4B)i|eC63&Um1`Y{eK%5g8x^Qs_C*Nv;lmF&}T%U>S5IHmFo${`o=<<v>
zit7+o9@tERm3O+&n46GJYOO?txbh<wT=_c*S3Z2ypx!?V7q1jvi09%gxbnVFY~k~p
z_R<A65pm_YGOpb3`_Mr>!VVVAGB%Sy-*>t}Lp|tjl_9SDthWVMo-N_ZdtF2Dvh{ag
zaOHUy^+kI_Mh_j2xbhGj!Dwuy23LNo9HJ0cz6NpS#d(M;A3wTZ-#`gh-dt+pK{w#_
zV0-@oYBqH|$_Y1S+$0HC?gUxSNfmi}8*$|aHWcW!abIxdM=ZGVb{;|;ZZdTG7KmMy
zVw8+4@9F|q)g<16E06nvD?j|!^9_uA<M|eG<tq_aJ_>Q=$M#&fcnopnqf7&<aphwX
zS6+~P^jHDn%Ev~Hs>YSqdL`k?hu7fB^JH9kyTT?-+ex_cyc%5ja2Z$rY5-^q*w-3d
z`92v}E>egq&y^5F=MeIB?i}vQ+_fq4%OUCW0;vdHht<WMxN6ur(nzp#3*B&m<Z8if
zuTYIh5E;oUzLpYSNN76VD;?n|5}*z$!C3u;vXNh#E`6nJ;KxxlUn*nPT8>{w#R%It
z!yOjA=WElGSaMo)TZ%Zmy@FqLyd804hb<9z0I8u4%c`7aaeQXIG;tM1-<LPT@cVLq
zjK634R|nuT>&X##etA8SHQRjik+hK?QSVFh%j;qDD;s=metA9g-b1&QLg_ikvLGEI
zR-s;9Gq9;t^}zUJ-9_CnDVR1GkulvT2$&j#0X!-uBOyN69<v6Y4!>f+n&heY4~`~u
zrBGb#bM6)b6FiXPz7^t4tiEG8R^PV*tM6Ej)%UHKhalMDyJX11zDy~+urEuBFYIq_
z2{7blp^S1frA|&jhF84Ly||oLVte`W1N>AU*&ggK-b1##+acS7{gLhNcF6W%e`K3l
zw2Mm;`|7e*rB0kUIW2MGl(ie@$X|;W;Md~U3i3-XjX%^cHFfr^bmM6-l#n1Y=88IV
z*@|Ur^jdI{Me_YMTd}z$k92V(HglN5W{#v|GlwZ`=14j=GdX=1hhH0<)SV&IdO-`d
zBYt;$A2;DTVciaH_71GneVsV(XYP{SB=lsQm?7Y=<=DIKcwDG-KOjD6$vPoRCf$V8
zeW?TTr$gPzH7v#|>(jBy`t;?T*-f<n8Q<Gg%`n^RE)PH1zvg>e>04Wa<y+h6gv~p4
zY~HkE$EHNY_r}Q|)WqWlHSyC0`Q<pGC63P&utWkY+?T)#!Q+IuuUM4;#Ht)0hQ#v6
z702=8isOrgIO(g4_od4BDV&^yKkz$?j;UN+HPR*?kv8!W0>0AkFsAe_QYw*?V#i2{
zmlSxCd5sd{4sn>`e>>hu$jB6ZGoeFfRwhgOXR7ga^m1sGne~em>6B^xf~(NnU$nGV
zt%}99YC}>zF<*C#r&defYIei_ureS2v&uj4KZfj1C(#S&CG;BlHeI36DU1p?g_mNS
zVxnRZ{$GLx%6iHM_&@HXD1TScDwV2^s)MSBYK&^UYJ=*u>W=Dx>Iwc&e-z`uxH6th
zcV;*<nh9ben2F46=2vDNQ^1_X?>L__FW7o)GuB=Hj&l%x$(hDZW9PD&>?(E>Tf|;s
zE7<?9xif)}qDU8b^(1q3#}FX_l9&ni4GMxB0-|y%A|l9ogW!R(Ac+Wyva-6UDDJ9=
z7b*xuL{voaUO|_Ait8eZKjoo$i=f8Uh_LKqaL3=y`+rs4Gd)QNc;LS7Uv*d4@zqz~
zRn^s9RsS2#3g?8IgpUg!&-#56!_&gESi$e+@T1{BhS!Hb32zU7&J52ck)tDnS)pfi
z<buff$R*77EQ`#D%!<s8%!}L|SrmCR@?_-2$oj~Z$cK?nBVR_oi6o;(Mhm0OqwS+d
zM|(sEM@L7;MW;ruVf~)lqYp+`MxTnVjlL3HAN_N5WAvkFC9C-CiYDw3>-ZGft@#ky
z+dk1AZjZGuvagVJDC(Tx;4Op_YvC?tQhQQ^EOQfD^5w92W2QP)y&n%(4gXWeC8f=l
z1xMdEZTWLWuf6!>Uq4uL>G==F_J;OM@Aqz^usD%-T)&p>-aUW&jMBYj!M|L$tZZD-
zsPiVBJap`ew`VYII%6>I>(I)D%N9Lbyy3CSE-008xeMHJITj9ha!SSJmruEJ(xeqv
zu3ELC;z@96n|LvF;SCpGIj(rrw5Q)FjW27PSQ~Et#?amCo?rUdQh?VxoI090^NHAw
z&hec8-Sp;X_c1^=;l|6SUsyc!B380^?yXHPteQUI?$U2VpUoaBpHp+XcTU8>qsQzs
zPiO6q(4enm!pHkx#;wZd#t&Oox;GThE${wuf~4IURbo{iRs_mp)T6f(x3j0-Fu^}C
z`K$~~Qh^eOL2~(N<B@p(=C;FZm3i^`1@RLS^Mi3alo%YpGg$q=oB#EXqIgd4za$#P
z%+!_nq4;Cj@g||oYc3fzYT_k>OIg;yDfs?h`&-xe3>J23|IG8N#<hOpk-7g_PsQJC
zRpRJB3Oe&gLF$FYDfmJ!0Mb8E5_GcSSMisBeD*CRbWTEvj}wvj$MR?WfbY*6E=xju
zcWliM3Qm7B4$9n)*=A5T^}9zqTZ-!$I<xPY+k!nqf2Y(c>QVbm)VzltUGVv?(AW3B
zv$48p=g2p@E{?Tb8vNCLWlNc|xMtPH&1)`WO_=J?HI-vG^mr)pRoAZJ)^mmpKDwyW
zJLhkk8T+O@_~!MGmS0pfX8hzK!_TdFb0+*=)M4|;>gyx98{69sYl@tB{<(vOod4<v
z@4xiQyYIbp?!Z{$gT%6ecFXwjJi9pFd}q8ge&$Cv{At?jrIi;AetH0Zjh)ajk)5zU
z>=e&?=k--DEh)Vr+-gkgUT6Qh;)UyDx#n$Dzlu8XM+&xt#wK10UU=gTEJiYG${!Y7
zTY7Bx#zha^yrTH)wd=lGRJuOCKKReSbqy_!JHZ$3Ua|Dq;-!n{{&rF6&hTB=PX9w$
zahKD^cDlB7Ix}m!hjO>_4Q#0XhO%9MqkTNSyx@(*@*#HO&e6>KNiaY+G#=RrdwSHE
z?Umb%Wf)|X`$a~%(IMZ9;b8B@kT=4N*TI7RW*MiiWErP|#H<j53a<s<PVDLxpAt^Y
zOuQBx9{)b<$eb^yKrbiLzU$YakJhi>S{gsjEI9MWs<X{<G6ZZrYyAnaj$OZT7o6!G
z9=EuB`K;KqSyz{jE6Uy9{GA)@Gta$p;-uKxE6-gzxLA+Vm#(sBwGFrLKelVBbL^kS
zZ(CV$=RJ#JUxepef7OjMiZ7q>;M1kfUlyNJ;;`UFZhS&0K8!E>IkKorf_q|EX<`Dx
z*{Q@ySPM2CA8$&3_v4S^ZA$mIY59GdaQ8_Q293PpssFkOf{bQ1=b6FY;m=+OCVuxx
z*qP-0lI+}LpI_os-1kt$s-m~YJbC(vvnCC{e9@H^vt!P8lUuUD(SS=zmX<hKE8}Ht
zZ0FPxvvh&;X2G<X4=h{y@B_=2KYU$z+05&vl{#0Hc=Le1Wc+eBL!<@q>Dv<1S=!>B
zf{mzK@6hR}TYO^U^n=5@m)i|<PQBpe|GowLu@Sx;N7868KAEK6LvA~pfyH?Kr~6y+
zrm~)wpTs34EkgVg(=B*%{P9Ho*YW(_pSFp&>d4=Cy~4TiXTz`D^UR~q6)#>och3E#
zxh(U-VlS-p;;y*h@R5muv8{Sw?8S5Kb4$utPoo7caq3?F_Utu|NgLq~OHZzj$BN>y
zE!EXaB_PpjTuXjPWHpyd?A-kyLZV9(Lo4^}j92`3znvKL`0>Ghp-IdsW_gO$@u(xC
zJ`CzOwI)dqXk2S9G{QD=h9j#gy_YyLxFekS&1do7><sP<C*DIt5<9cw?`=!4M2r4x
z>Wo|cdiguW=5OnpO7`9sFGwg>YZ-s-jEm>Yo<AS9%V*(@d$aaWXD-{FiH-3)<qzN`
ze0E>5cmKb_&Q&Gfk90mP@}`XT)l)|IF=KSq3vU(w`RPl4RVq`a+(}c;SiU2BpBJ3(
zo^gMMbf(*-&eZYKCXFA&3LCc+#5X582ft*=nRt*HzoB4NII%g=DVQ7IE9=g53LYJf
z-;9mDnRQUQ$jURFgSj``xht1S->z)(<g)U~lgpQ`jP38{b_7|Qtt6OxP02e~JEw)F
zUKG4;`t{{gizYr+_SDlWmOZs{`jTJ&KIU{=P*6UrZ2D!z<F8t|_Ak%8_2v^xues{(
z(%k)5`ligK=F64l#&3#`E*N>qxRYavCh_sKIa^kbA5ofHaDnYCn{nZz=M*OzpWtkn
z<Nhw;Tv(FJbUp-w|39ypQS5wnIsWTPLe9Mx*-m>q*Ssg_mjvhY_^hh<tj{gy(m9{?
z-4<W-8B2>KvcF|HI<wGF=U40kR}wqFTA~TJiY9yL>ti;(`}*sf-W~J$kfCG742e0X
z@I%8kJu1DQWtaFw@=2dBgrQlE*AbSrXrwb->dct=8-8HD_mM{)zLy_Ze<M8*S$Aaw
zYm_v8^@T^5J>pm^zmebb9saW(#QHb#kE-MS2Q`<Mqf2-dbJB}VJzi4k@O^n9e^bU+
zQ)EG+uxotAcV&qgUE{39l9-4h%78Q0XSD6y{r#&2@tjZ@3mOb!x#)QdZ(DfVy|>+S
z+k$&?T06UBdB4jN{r4Zq<OUX7dC+z`@W%c84rkE#b56g>;roH*@FM(IaQxQ!JAI`J
z=_^&xS8`t8|8K~?-$8uiyn`~E8ERUN{<XI&W2{;6ubqNjEY$K{8S6+fz;_3OlzO<%
zN#uXj@_yU7Eq+1&t#5CQFJnQOIbX!X;?Mbl#aNs<GGy%(%((uVDN`_z@9&f){mwcT
zwLSU6i61ThXyS(ro#SXy-dVTo*=J&ow?0&^3XEet^Ge)Xsu^p<w+o!D@~y8_QDC8p
z1ukH*%-2;T)`XAY@~!g->#j~AZJu=^agyg^6$~7u+E~L#_aW)tQ%)eLnpl<iza#t&
z)zGR|E%3LL^g8cr>cqfFtiI3PdP(JH$=);*ReqqY8W<R(T7uKSKtK(&zU4ld=lLx6
zvYY46S@mQv_wNX=!d;EKp7`e6ujAg4dl)!<!E>DI6F3*xVRdAnpRy^hL7=tj%v$#Y
z1!wO0EHz_W*Q<Q$SZKYYnzJ5Ff9p@GlzXYQ3D-|G)c$W-h<!T?vj0!=@7AZ<-#_p^
zX$iMJ)9L%`bfbBGl6<GD2G%#Km-PwxZ>Rq6sT}b`i;k>zpRc|I2kxY8$J+hPq)ccO
zQ(Kd_1lp;_){5lc$dk`1Iqle?pwODn-b2;K1xkR4vpyznI5a-n&^3@p8RMX@@at;V
zkuPwAYL>;`O+sh(Q(El0DcrBb-HN*&_bRRo_YiJ5t{k@pHzjL?YL_*hyd9xoBj|cF
z&pp_m=y(+ke9S`i+o^Ll@OP<JS-H?61bx1O2V-#4NdIT}(npQbJbDH{@?%}3IzzAi
zfe+zFGwOS<YOKSdrL|iP39Kib$U|LD<OA98J7i;k+MTS%?U8#*E<`r`PGm#m!tX>b
zJlQ}7L=I{>;g86J&q=-2_T)a?Zrm<U7DOI0oXCU7g5QZOxbhHaiR_~bS|9$5T;N0>
ztrn`0$g#*$O7;w&UWPBP3XRZORg*@-E3HF9C(nsK2`_5UO+Sl%`P~zw)B2YD4!ZWi
zMQ}$$*S`mz1V5h+iXM7S(_Hk?bAGxbGg>!AKRq24UU*LQ)zeu&Z(P6Vu-|EZxjfT4
zU4v&Lms+QUZ>|&l6zAum=l9~gJG=}CKcTzA9f4~tyvAOoY=CD6h|{*gn(W#Gu?e2@
z%L4jgiG3KYy5Nq{c0y#sb7R>5%d;DPS#kYhNBmC9k7rY`E7<LZ$c4y;=<ofm6FnDq
z1x|GRTAW|kr)nFHZSd@i*c$D`M&iU)=``3DvH4<m#3rQL1hJ=LOU0hyv@MU&eneC!
z+yGpATpq3!?kL<DxUM*fZ-VRKJ|E5f7+hQIiP)4bzPk#%p;~5T6aOyq`cLe~zu5C^
zo$5mR{H$iWyzZ*IIuqBOy`3&*^Nfo*<q2iN*=hm%r3D5K9zI&-P9A@i?4FcOxMJrT
zxrf-DudzBxWuJJ?pk}JW$)iR$Q+-CAep)ki%IHz2$_`5=rR)!PgleLnf+~P^<f?|M
zsr#IzqUdWQY_+5g0@+r1P`Ai^4k||-sR~rFI%>lB@{83<eO;}u&+F@}`uc{xzI6$U
zj;l@j`hmV~)7S0#`h~t$Po8w?c=Zo`-LJ3zmaCPmueQGCO`bYovej5$kJ8r``r7WQ
zDbv~aL|>27*Z%rCSYJoz>sio77F|2|74;!y{a2wh+ojZdB~o)tuEIHS7&@paJ2)NM
zJpxaPR14JsjqRgOqR(O=4RG)MT^#zl&nw*L&F=FyU6y6#n`f&To4K@OHy^n&uxWJw
zn>-*7R_C!Rm2laVcOR6>qxD^>@A>QjB+mubDt#{mTDJ9s8z;L9<^t2Cc+!{RDWCso
z*B?+DfO7}ix7=pUwH7dtdb4#WJ1gD7b}lzqH(9@BYnR)tTdjH4@2oj!u*Lp27Sbst
zmVxsS-1D?23$4}G8tXx85d*OgTaQ?a*$U=SYpJ!&dck_$deOSqx{vK)9<ZLVUb5C&
z&$3C(J=PzsSFM+=SHQ$#*CLCZiOgMgUJS4mP0)JWs<4)`rOXOqEKTizCA;@n(5WH%
zkdJOP<*5Muu&H$(J%_%kAGY&6b-ucQJ)<V6%heU^zcY>fc4n|^(k&X&5Uw%J`5g4d
zKq>%M6QC8s;Syp?)p&Lyll|7NVIQVN>QVMk+O9rTJJe3~nda7m)-ohVaIw(c2BLlV
z#3o$E4orTVP^9HRbXVd%YtTwtgHGBSbl29PuhwTRpE@m4ukYxIz9ZrKj?7+haoH5r
zMqfMWYd3xErLX<<b)d$Mi!Sv*SJ4~Mu7I+S=%uWFyELxTu-vloPY3oq8q5|yv*GWq
zKsJNxSF+#M?yNRhi?cRmtL#vAUUn>dWcFFvGqc~${vi8*vcC_8f_cGF!M8#Imav~4
z+7Q|ist()X{BTjYS9k;)+x#}XD7+^8D$CY?$+Cz4jI@Y!iu8#Lh+G@_L*#+T;z&hg
zRb&mr;IFZK{rizmB40#yM{}c{qUS_!i7t$8j(!;ZB)TK|W%M7>?-}UMvy1Fj_DKwK
zpJk7;AFw~L_c=|SJ}gFmt#cpCA+B;>b~ZR~J0IknlXF?l+?)kDujOpd`S%eKYFt3?
zt_XfvNK(Go(Bux)Fu8^Bt*S9`O_EirDgHuwfJMnl)swxN#wItZvx&3RQeuKSZ4=^~
z0--ms#u8@(X&-VWP$k7$AXV!Wh1`4N#u9TjX|mWQuK_*V2K2fc&{pInXOnl2YD8QC
zm}m@;$wHosaA#1ulyElKNcjzvLuei$l0!;sO73RK1e#zY^ZD!<W~+tN<Dq1=H5WIJ
zvKH`$@)YT5HzX%_$tfuxBGge~VA!1bxB@~N6I(*ca@;(^?jYu&WWt(@Uoc33fs}58
zsSTb972{bj7*Eb}oXOPyOcGKC7zDw91tXBQ2IzZG**rPPm(fRbTSIF=xwfuPAz{s^
zZ3aYeyn}izOl|<;I<+LZ!>yO#?hu;~WRr7`FQ?S8iqskh^j1>lQMO>;lzVf^Dko<e
z<2~fq1ML_<)L8GsIs{0O4Mt&jBy@6k&V$dwccEngWf-Y|CWI9cRzmt1t`sUZhmYgo
zVL9pM0q+jnLi*nK6Sf41%c#xV<OXYAvda2BeV_$!?kTAk_*Q4&E7WeLsVy873Jbpb
zG<9<{Cyj(uA|VM^I;8x4l)ul-QLS^#1sBnTLTc9tC{20J!dI#CN!b+H7i(#2I6~V3
z>ePhirqDp>ESU79e!a;zmK<kmc(((Io*Gx7Y)`_+l1^j-Og*Yns*$-uQf^VrfZm)`
zXOPQ4s)joGP^N);m$EiPsrTqn4uLX5k>^urfrj%u7plQGDAs^FybErdacN&z-40H7
z0d+IbcoznO4FvX{J3x*+@Dh!nzlDq$ysD{1f?7%q6W~-xNe!eVpap>z1ez`S3_j~<
zOO7B_1Dxnc3(D?@libJSt{~?O+)Uzcg?1IhJ%(F>dmOhC_XO?@(#m`29^5~0dvX6v
zZnbit;n8sW7~JoX&^vK=11lHnutmLvZWwyC6M8WcuulCgxzXyHe7P3w&VzR2NZ*(m
z?*P}0a9}GOsDuMMw7n@uW|2d~3Ca-r^DcBW+z?t8Qu};rUr6m4*GX0i1(4yb$nRF9
zU@2Ie9G$@@=F1`c%SXC|+BT<1c{5T=+2!P2NZ1m1XL1+1xrO_}yL?^VyyO<q6Re5h
zhscPZ!-AE_g~-Hv$e*MWSrXc`^x@KybfN=ZItxif)4>hPj&r~?54to5zjElXglC|E
zxoDbbme|RKU>?ODU|DblxW>dZ;og+H(Ul!q4$v|1@~l+|{Sw<RX{A*W`ys8*9?EE_
z(}{K2K`c6n9;ET4G@vwuN{w($NZS;S^~CiCf6+SAZk<Z)EZQBp3J;~-+d{rg8V`Zp
zn9`bXm$t76SUr;!<ln-UfqwdBQQw4yEA5L#>PqC!Ca+M(v_YcrBJI**iPoF82t7tJ
zs?gPZq_$e@HnkJ#iIqQ*l$YVel8Ggv<pdWC-Q9DLYg*F})mOi))F11xYiy9M6oc$Y
z9Ax+5pt=*?AB@A#TX)bJi8})~ia1GaliDV=O=_FeHmPk=+oZNhZIjw2wT<0v$knvW
z4Ss?)ABbv)N}syY{ctOqhVtX>(vQ!1aJ%!%q^Mt->W|IrC!)8!fo6uK?SP3*KR3tn
zc0-y*B7@y{H|on?q}`Cl7}6NyttuZ|kdIWxkjfZuO?}m6#9xiOljnszZzA4UUfpWS
zNGy^o_FF7)KBM$v_46&@XiE$nBszvf$B^h45*@37t<?6wwpT17Jx{bo`U}#VPUwEQ
z^rs83LDC;>f@_LxDCAjs#y#mfMo1~vuNn)v4hu%_op{%NVsUQ4iXTaOudi%ih>et9
z%NDGS^xGtl*iz}~(3_)dc~3Bwy9#L1%c&wgE6jJTkhi67YkGXbj^)F7JKKY$3MQ+Q
zRoE)AX}IJ*a{PynWMy)-4}~CpJGlkSHiD<LJwY&+z6Jex?b}Dqe}MxWDxeyrB@@uI
z0YwJsMXTh?;Gz#GuW;Z9UK%ITRzq2-!(ls;JCG*Dbq6JD`heFCjfwDB@R9N}aM08f
zjDqkxBj)Er6Mo%mD3ja@jkYE?z}c6<a|5k62fj^&82=W0)f(%K#O#5-3OPTFhwukX
zz{ZrM%QigQs^j*d7yE>d!d23260D5>@S#O%tFZ>YgTAfEls+_7po|<)tJ3HySnc46
zJ;A+nCdJQng=bTNW!Hdb!(CVJ(s+pvtu*1hspG4euLhVO|46O*O61=cU0`L{FDcts
zzQ;=mJv_A17*xW+AY74e%DzL>#`o;^rm&Hbg~N0pBWM&dkHn`BkB2%$EI}r{Txrvs
zOlSI8eee;9_pCx1v}aY*A!%RAHh8j@j{1M$tB==*0l^<EHE6<NkTY#VtHcsz9%&h)
z<uo=n;o2^*<xGeMxH#3h!+a*hAM=#P;>YAZ%y}AUy|e4Wv92%=8~7*`tTUgN<|p07
z8c5I3JyLdxXZk~KyX3{D@9^2Q>sx8}T^^@nf4VO%y)Jc97}o5~92m!+jzuj9`Gg22
zcFI2alN`JsXljwJi$C{&#Gj_~&t80uT4qQ&apsvmmuoS^Iw0--{zT`W9VY=;1Ft$m
zY88AqWIVHwWJPj`KZM?eNL%ViV?f)Z>j1T?&^~EfO;5UAD)G<fX^>ni??4$xJ@z}z
zW!)EX`;Qs9f5r#tAxb|CO7MoH58ldCl#+Me)M4zEJWAUQcJh8F6qfh+pZkz<wM*0E
zfJOLllr%a=Z5cga_Jc?8p7um-J*T(0+M}BE%}lS6v{-#3Puv+hI5_+tn}VMGU3Jl?
zmb}$bfb>Luk;Al&@)lj!aX>z4OXlUM!C^1>C*MWBMT+H+?-bsV4kg6bK9bh86kf{z
zU-wfFA39<aMAqbOo2!hP$X!CD#qss8Qr1Hf9QH%-Nb7)+GGqs7Gq}+vx{~*cNxanA
z*hT5*`1LF;VQF7o?Wq^$PxTWi2^}_IFX(IPc7>6!v}K|d(Db6E+9b3y93)i6a~M0J
z7B%{@SUFk@93xV|UBdYz>>Thb83!;nQgmF)k;^aB7nD%>b4oN1PNw1r_W0^2Oa`9X
zYUtXy<DmYOhvs)BOZdRZuP)QY#Vw!rN)Ia|bsFx$?UjZ`t$q-Pq0~n3kWn4MLZ5Dp
zJ^N~?n5rDOrTT*hq8Hp2+ec1KE1|3SX}c(^ZmXSwZ#?j^_8YBhFKQ$`<i`V*3|>Fz
zZ3(W(xehsI6k0}{ka2gE5^a#43}b^_v1o3qy_WQ5<jmECG}6RWL8VIMUm$8|zStk6
z1L(|Z;20?KWx}H&WBCG;aw>G+bf1gYCNN|Sn^?Q1wgG7mqm9y=70ecKik<eiX@?+b
z<ma7~+DYHe$b_xyC^kXfqtoN$M#7N~Xs5NKZduXv-A6w89u7YGxmsXRmdPnqJ(NSf
zyrjmOO@_22h_v7`^eG)%8@wl7z!V8j_g_uDYG<f3gxvBKL63v$-l|RoL?5kr@_jQB
zIvP#$1N5)~mPn%LpW7>;#;(oNca2>-wI7D~y%;R=jmD3UJ}T!Xoc0S}<(-G8bx_DA
zUiz*whQ3;KUh_L+{L_?zOvpDT85@<_<>NE$K5sVypZpS1?*Vi@=HXAIYg|u0uTK6v
zhvKKKL$(v>l>0qRII5{&?(PV+=CEm{kYM8Vn3BO<@T;rdLLJh{`?wtP*O+eA;i!Da
z5$jl^P1Wf#E?SedO@X!*DL!uHZA8DV2{+)S{<;PiwgMT;!L#sDMzVEZEdwNDQW8!+
zY3cc^&e#^hL?ZYGh0nYTXJ)JQ<jR*okvvk{6urSi>m#(H)<~|70Y7}Y)^&`=5+~Hc
z_K-%jDs4+$U8se^AqKb{@TI1N6s{(?J}L9^YuiPAeSm&|mlAwdK{9GCdC-V+sU#G0
zL&4LNA5&62<2iLp6B^6ko7j^YWoIB>E6yuFBUO!uniQ-NeUL}7GP+Ihy*C^ni`YEB
z1o-`4mbBGQ`@Fc+-K&*g?qcta!wAe;R7hn^dvqG~q^4fgg;P40>+{O@M;w~2k%Qha
z@~8Wr{`|&|&3YOA;rnrBTn(8p*h_mSI_dQ`>ij;G@z&x3_2cTV3%oR3tVKcaGVO{t
zwxl6tXrEBdP|t1KQYF`V=a9^`C@=jsUz?K3EosHlRq8%L4gNqW>EqS-&PVEcEOH8R
z-Sn!%X4d3};4g=6VXKjT`8U_!Lo?CdRd6;O)7C&!+)y|R+%*@0CHES&^8k2vZAr-~
zH87T0VoXjyAHDST-zA6Sg@)qWgLM&T;@`)=mo0T%rq%KTl+@j1Rw6m&Ds_wD>#Tj)
zH*IC5l}Pu={P}-DzqS)~@byqpKqoWhUh3){BJC+#rFoHxsY?oL%aW#-mnYR?2u$=;
z-V9UFQ;FnFNB-s5<kuJ-TT63&dAxSGPD`!rcsjB6`R#-+O8N)#U%8QZ!7f!QIxKyI
z)$;dH`Vji34oRDmbJEJFgsFAr2#cE`lkyTs8RYPe6piISq>b{Il#YRFehULu>nN)R
zAA#kA<U#cAsoK|Vzv_^u+Pnpp=&RT-vH9}uR7a1=l$fHDPJvWNPTm^iDsaSB82Qq6
zz(cdv-N=)hM&1y_vbZ{^<5JwJmG#g9e(Os%QO)!{iAC^QViP<vBX(ly=^QtoA66=|
z4$nw6rjAfIC`0RXSdHG+fwQ`)O#&^6dIaqt8S0M~=@#wv{)^Mx<3CHKvIui^LF1g#
zgSr7~9*PtoYqW93G8<i~v+b$_iY}q1+>4%gX*JY6rUiv+?teVkmr72wBc)$%hC1}I
zW*<tYL{@pbLWX6OOW?>qjUtKv;X~J#zO<)9_*_2yXAzU}A<dQaNS(}0@eVCh$@$5<
zya2hw$GUV`#&TTiByW$L(pJ>4SD9Z}H8fxEE29({Kg4zc3qGbH33l|p<V$S^7P_=0
zp4=csdTfQX#+J#riR2Io&d4XQ4}R#}B%?kuJ5EM(HC%kYeg^N=w~#VTOUZ!-F;eID
zmU)sfSP9=k8zj;y)E2$1(pP^QFUe{h<nX2ObVb@3T1V-Z>(NNExmftPsbP7!O&ji6
z1_{MR$hX@x_tRz9K{4=6X*2WKw0+S~gzFhNrokN4r~4cRLq8rVoKyYQ`XHrrO!S|>
znuRKE`V7w02O}Nll?^=wl1qDmXrOAJ4u#KNIqB>vA34*K0_8L$;d?!KQkj3S=f^@d
zy7q&M^JUQV5b9`~jy$>)(@A9P@DS@uy0m=x`Vk^&88Cfi*XQ-}x!Nbzg+n0Wz$y*X
z$nt?;X{+R6;KxhG^6Pp3J0Lu-f2?KGPzP*;g7Rf3)%!XabTpaLwUatwThb+_>y{3y
z!)GxsJ{v0(X68hc*&#<TgZ{wt;hVGaK}*$EwP*c<PO1y5w{)c+)<gAVRgB)OS}=&!
z08V2b#8~EujdSPD|C%{pSE=dDCb^c`Bny~H{{VAWmZ=K$m|DR~L@U)3>KSIsKdYWo
zYt?$z8QR3WyX~rq)nS<FsP<YxD`bVOh!wSLmV0ezHL~)p=2i=<z16|$XmzqWTV1SU
ztv=Q;>vU_Rb%r&;y3{IT?SeV1XE2YI4Hht~qZ7R%=@*5mrA0lNfzNDI)<jYrRcCsV
z$1umGxf;NlBwbj0VVF9Wxg5i(*9dh6bvu*Q1o~2=al~E9jLjjeCoqNhX`H7~znP@F
zmUA?#3*1Wo@isLd8vLH~9Ca6UK3Cn#c>y(lKwZd60uKY}QK&FMEmx~qU*LK5S0KHw
z{zmR9Rz|s3?c%&&?Y6Si1D0bQsh+YLTYc18t1q)|wppiHBh=^Ab)>4Y&a}p;FRiiG
zc=a_kFH`%d`CKS=D>XlY)g1l+z3$><1~F#=wa>Edq6X#&G5e=L(`7K+>xC<Z$N5mC
z1UblK)^Z^-(MTP~=^YVy3s<d>f!084ql4Q*%l6vW0cv-Es-2)~7pQzR(sc}Sa;%%Z
z6zX<^&OLbUqmGBJC+nCKxt`29kW(=qX*iS}!F=1(;mb(4;N{QNslCe(p!MTCK?n3o
zU2Qd#^Ay4d5yq-22RwRF3#BDm?tOH+f!rHvUk827;yReQ&w`m8tQE$!Cv-@kt$<@r
zokU+cKnZh-wwUu6O^t5Qz&m?TnmN6kh6+N7fzV-udydp}@Y0xjXQ-4udgApkY7K|X
zDHwRTwT55jOy?3`TTbDiIlVX!`;PEaIBL$$nzLSg#0r?x)J!abIs3q4bDji1ua14-
zxH*N}{^LYur76}_kwfAn*3e)e@*t<^q?}62hDVnX$cQ-&h0Z{#4ty{tmKksZv^z>C
z>y16=t<T0hcXyrGi2&=X91j)cXbS}-z6VrjsV#_u1PnxzP6F@#$i&G&9g4&?=QPJL
zK$Us6r7r&FeuVpE!X+#VJ&`qKno)WX4V1&Qgayp}WL}*<S)C34FnD<iXCuyhD4EZ_
z3HsU;4Jm}mIp}OUzxKM9R%kkNn6ZE*oTlz_?V{rwYy02UpP`ojD0EwnQceYC%Ehf8
zveOMslYb#k<=T?F=u{i@z9kqcY>BLC;+-Bn3|GxnV45=vT9{M*=P0I}^l2y|*J17{
zRPat&y~Uh4DLQCWG&Zaz$6d5sjw5j%K7mx(@;6i(D-R=on8c<zSUar=k=8~|7D!3e
zM(WheJx$I0b@S?wd2a~M<rGSLH8v8^NK1hy2}T-BUh#`em{Z0|J*vo^*^v4)6f&{`
z_yZmh-kro5+ZsaGL;6&@MaUyx9`@kSk9J6fSQ?8g5nMLh4NRJN@zk=Zzn+q@W=^hN
SD@nK{t6}9RD|v#>rT!1}K-UKV

literal 0
HcmV?d00001

diff --git a/docs/source/_static/css/Calibre-Regular.otf b/docs/source/_static/css/Calibre-Regular.otf
new file mode 100644
index 0000000000000000000000000000000000000000..3801b704cc8b83ee419b44b160b4d2105f4e52f8
GIT binary patch
literal 49856
zcmb4r2V4}%)^GO=-2**3D(#>$_6+7sGJs;h0Enod1jT?7hM**gW)XGQRd;pG0Rgip
z7y(J50-_)p1jM{S-PKh$Xf^h@`%d+MdiQ?cd%x#irn{?7oyt`w{Lgju@NgwM5DA2o
zu%9&1-C?U__eFvz+DH(@n8}WgW_>n?J|GBrGeImaaB_2XpZ{*dM}mmoMiA3@CwI?D
zuYdlSOAyh^3BsVOtGk8u;@lN7f*>uS{9^ww-^ePzS`&hhEr$HL!CK#d_wobjkXHeJ
zqk<uU4v}sJdIR|G8ypt1ykCsl27(CPO%U`CVZO^F$wD{CA_u|`9qt>ZUHra#Z}{Fp
z5L8WMM0CtQ6@(hvmjn4`1c^zB4F%mV^jp+v>^nj#eE}&%+1tV8?azNU|6?QFCS42P
zQbK}L;UAQfZ29{raa)=J`J1KNL{9oVJ;7yya3jb<_%I+_pmvHdlx&gAg>M)Alk_3f
zBm;cplvFB_&=M)+!AN}m=Q7=S263G*6C`bQttHCL2x1qeAPXdc@Gyfm5kybY>r?&a
zI1~QxuZ%EndzZlb$hLPWVM>f`dpG!khJw^~J~YvZ*x5!SBbF0K+TIO`E@Y3kcO#-B
zIk@dzPAEy6w)YNSyeo)4<n*?5W1<ro0qrFXjG%Q3$hB?nB#c8#+q;BNkbK*_lo%nQ
z+ujWbwPZ}&J4Gyqan$pnzetx6ZzVNt>4rpa>GigEBf?brL)*KY7%bJby?6NHT|qb+
z^lwY=OehRO)6A?bt*p#kLPNsLJXS<%%^V|Q!vmsLn43)u4K<4j2?~yhHjC0mYop?{
z0p^o^Lqq(cw6nEAv7x?ER_2zL)^>RI?AkJhwdHteqoPA1!p-y)ZtiB5miX1mE;u5_
zKO#I1XPA$)u?zEEs*Q*VGzYHMkYa6RJ9^~5Shancg+!ZaLt=upQD(l-o}iHE7;Tg`
zz$_-pH$WTa8@1Fd0_S}38Th~S%q%3_Y^FBeY@XIP)Hgi94E`*jY=p?$A9^1X6{3wc
z|F02oiGn7$N5n(d{8j|V#6*s<u!xV3H(v@vXYL<i9=`H_Otc6M@z;h&hlB^2M+Qf>
z_oi)}h%{K))`TTt1^<~5E<`91LWIGa2eE>PB((76NJJ2^L^u&ZL_wN4q)Y^=P<TcG
zZ4ePm#1PTqmlmE{NR5Nv0HB;q_=^1eAPsYx4ZlIaD-_bB{`F-A^p^0SHDL#>m`*qo
zGvMt%ISl((ng4sPm)MqQXk!Gl-VFNpxdu1F9a3<8?WtB!TQE?@5dI>4+@})eu%B%p
zHw;?66nJ8qK;VT($`9I$xthb%im)X{1MPpd$_%LR=xE^?12y3BF#}F`^zdll+!!$x
zkC_>~M?r1?q~Q9Zphh#OvAxXyoEr!<|37nx+k{7CCe#uS&w22~GvzDJGHxB7ZHu;g
zaev#}jmHMB1>BAhC>0Go|9`I;7m*r|f;*IthjLL+(hpV>UOO?c&c;A(7LbGgGlxFw
zSE0E$7v@mk%Kr~a3$e66)DjN0;GPBn&q&}9`LDUb_l)>Q_HH{ld*YJ`4s?8v(fDX@
z`v+-l6zwVElaxpeC|VX|#K=2zRCMax#hC4?H0jp8hpDP3r`DMD>fNVrzy1RT4jMdU
z=rD5&ODpT)BSwz0u^m0eZtS@66YN(_nmon9(P^slG?(c!X1cn~n(gl4={0Ar_q_QF
z7WynAPM=B7$jT`!E-krv`&L<b<(-PE+UlCR`o@O4P4^zO+>bcy>lYXuwfo44y{i*{
z_+}e%;K!c=^ex=-9kG4q%EbhseM*GHFaJ61Y~HuK!Z!!UO*Bj*9^LSNvSev!Smd&p
z*cGSFojiZx;-$-(*;k0%f~(hdtXaD*Vf}`Uo4)#*Fp!P{b~6ckxK$NIe_{Z!h*(Ul
zB~poF#0laokx5P^*OEKQe<VF5Hj-PChw4r0Kh3(DnVPB1dYkn(vosrHHpMKxSGQih
zdwKNo>22J5YVXCpwSwfIe}JzU@EZvHe1YE);CG5RM`V-JfZqp6ci>klX;vqw|1e|C
zx|{VhGZXoZ{>0C{*Zekq0Rs8YKmR->?BP!Ze-8g#`{$Z|7zpC;di?e`ND2S>@5$^Z
z&QGkKXr7FC-1Io+$($#%9_@H^;8EhEb=1=TVXnu+IXIh)gJtl5d<MVI$QR@vq>lVs
zLP!iGhX2X?zrF&AMCjiZSdB-Bb;M!f0QC47u^RTlGGZgKhxiWG{UKsMkpe3{jW`J-
zah5nuoFTp;(uoU128_#jB9pjCTp}(L`-m(em&hir5P3unah%vjTq6pIt3)A4Fb9d<
zL@7~Bln}Rxo5U?*J5fed66M4lkm_oQDx#XGftjl#8i{(MfoLM`5=kI^JRt57EyR7|
zA(2cxB7P^WiMON`@dr7act?&R5NS*B<Y?kO$Q`ZZ7~%tINBm7rAOvzO@sS)){6pFk
zI&vIIl9Nb+oJdN^DWrs)Oj4vHX+S!V49ut@tP`4aB8^CAQbz0`<)jPQft*fuBxjH_
zNd@Ujb|z<$ok%yb3ptzEN_<ZmlkQ|!(vxIK4^l~bk=;pe(uABtb|dE!-x9k>Q*u7p
zlUzuu$OR-v`j8rOF{vgO5&1+j@tk-;_9p$wUZfv6kPIRFk%43%GJqUF29teBE!m$8
zA_tR8$zfzTaf7rVBS}kg8Bs($CjKNx5Py*)$sXi9(v0*a)(~G43B-D01MwBC`c32z
zGL#%dE+L1KVWc_8SmDHGxD5@(6EY4?#t1lLqQz4MmcTz{SpNO*e<WcbAz-*V|4Y!o
zX@mIA1kX!&UdnejaOfaS>tG;s5Egzg`(RA<?Ap+&r*cr2!k!*WjEP6UgP@6y1PfAT
zFJcJX0Jg+<xHsJiZ`h?;I7MS&g>8T}v>R628CXNPuu{rk1vJ4-KPG-9-ov~}VDyb)
zbk#5-!^q*JJ<QT<axRSM5*U}|<OXs(xr;nN9wASWnPfg$MBXAR$$Ih;`4jmY$;0|L
zkSHWwB`Qg8$xz8~$!N(0$ux<FWS(TPBv2A6iIc36Yy>%ckK~BttmM2TPjX#yQ&J(R
zm)w)wm;504P0}i*q;hE&DJShE9UvVh9W5Ozog{UYdP#kyLDDek3h6rO*V3KRWa&}q
zY3T)NwzODUF0GX|N*_pnkp3q9Q>rtN8FVyY4LAccgP{iF45k?@FbFb;G+1HqwZRU9
z0|v(o(hYJAZW>e?JT!P|AW$7BHDy6fq^41GC?6`4T1joBzN7Y2C#cI*5miS$rhcRT
zq&`r8(-f_sm9!b%pSGY!(PQaJ^fY=FJ(u>O1L<%&mR?71roX3?=~MJMI*Tr#i|KN@
zmcCE_LjOVkEu&>!WZh+EvLUh&vazzsvKcaOnN}7fTPOQgw%a^BHZ(LgJjBv+qUA&o
zY2&mL!+iatBEly|1Vw~vmrjfViOttPHby(qL3~Ws|N8p^&1C<ODF4{7z)<b-$pH~D
zzWyNL#Y_>irT`D$7zdG~gP7v*iNZl&gM+>XCy~bKGb<<mh_EnUJu|0Ig{J!XMos-p
z<SdqQ{*>*k=jp8H=^PUh8lZKaHtll(moK<Z7yB?>Y}WKI(}CSgF>j`rH}i`;hwva*
zu};_ESQwG0*sxIF*cexRwXV^jzR|&A$=Ui}vHa{W%Dao{?*7_<kkC+H_s<P;|ANQ^
zI_V)6^=KpX6zlZ-vQAHZou2xxdg2N46#F|ztYXe*)^mcQeB-op0zyDVhz^PN{+#12
z=FAfd%=<KmzG7v*dj7t8{=T0UfUj00_t*c5t@QuQ(f<>d05K~-TmV{;qgG7Oexd+H
z1urqJzJ?%?Cg?M(pf5%w=u@F!7@OeFL|Ffd(}tKseu$o3h@M@DI2IvG@S?UJZnNZb
zk)>boT^ban)rMomC&WKg>`|!L$j~onLV<U<m=`YQg@2K!4G)SCTM+T()I{i867hLC
zM2Y32#PU&Jl#dqEquXXV`g0?rzaWZ%F2{&PW7-H~#X4iZtTR?$XRN;Wv3R;-#UY3n
ztBC*1I$oUg`1a+n;&aXlF=wS%VCAO~4_p)|R&MSd8WHEaG&<7PU%Lno>u~E)=DwjZ
z|B;~ofU<tt(1>{GgkM{eF{Q}f+}{^Cz)6B%KG!XN;nM`ahz-Xt`cC0DvB&u3Q@g~E
zwl$64ahLJ)%TdAK+T}2{VF3~GBI{+b+UOWi#@oMxVnaf)IvN_P4UEC$pAtlEzda>X
z8x|4sC5f2uiA0=fOzY?Czx0cbD4Z6ljS7hfz=tB%(M2jaZM9-;!7CzzwPM4x;Q?@X
z;9PB3`&*=M6s*p+hKV1aYV?C8wN(45No}A1k*#eni8G-MV7$QWeZ^mxKx->28;97a
z2>fRr8WRu!CsGu=;FiJzn-oIvn1;rzfF_2<1cpYy7{kj-tyloIE_}qq<H9k)u#qrL
zL@Z_z7!rrSqC=MBU)Z6zm{x2-XiRvBSWrJK<}tp0;#$|ggu()VlxS^Oh<`-;2VQYF
zRc!Of;rho2{bQ2;F<JkZqJKC5@3vmRTTF07Y&2|fXhNtqI$FCd);ARPy)T{){2m?=
zBR*lKq9T@yuaTj#(Lf#+60U#7hQ@?MhOU65fRMP50Ngi`u8k!;+8DwcF56!4Q+=N)
z+RL@q3OQd?3}0=67;zBGR1=6DAWPo^Iadu*P$GGdJWDo#tkWuyN=!h?*$a})HA$W1
zrL>z=EgdXfCH+pCDlL)Tll~<A&7h0H5QC)#u?E{f^2j#04pK*jLA^nf!H)(nDVkDH
zU8r7EKWYRufpVf;sF_qWwU*jN9i)y@x2XHnM_Nr=(PL<DdL_M$-a{V%*&>a;Oy|%g
z^ke!5kSThAJTXQ#TecV^iC|fjYz0UY8)aK%J7gzhr)3vpxw31rVp*f?7ujzz-jFfu
zX=rXZ%5c2l6hlu#Kf|SlD-G8gCK~QEJZgBsFw?Nuu+H$Q;R{1#WMI_MsEbi|BhF}u
zk-3qz(Rd>lBQGOgqadR&qbQ?zqt!+UMjMTG7$q5<GP-P(V^nNZWpvNzC!;q;9~h|;
z%@hogi7-kSDHsYS!c1WXG8DR@k!Tb$MBUH~^;i?3uU&sZE--d)QD4;e{#(Q#`ThPv
zU$tB*5J#WQL<$vB{9J{kKX_apbX?Q>B;}G6a{QWU`HhPW4|4rIlGQErs~GzVfmI2H
zBOUsfmo0u0t3i@*s&2)}(D|lL?g4i8vofk<fFbH>`>W7VP>dcb^b~qN9EudE<D*|u
zPxS<o!SkK$?B-X$dX-;&|9-yHV4%*UPZv;ollC4yVwx745*ZO29}%6j^5|yBnjqYu
z=dW5EJV!M(EW1pD&Q1`n$Ohk-_O>K1?c8x7Se6=p;&^Jxi4$=tQOn}um+jz|rNkdU
zp-)*B6`#(?m1c?4r<$n*#hBM<02=V{Hzfbz>cYt<G%YluqeeCg-Bf~=&`IblcnBUy
zA#_F~(BMDcAzMwB5alFOD5Fy2ppMkzC-rqi?%<+Q;-C&OW!ZWzQY}XkgC%QyVi&4B
z=N!Lr{o1h`xjC^*_G{!y2iihVP}Ov4UFj5f$ORMKYQdOaO^u_GjqU?g_dUZ$Bc(2y
zs)vMC%mi961_~3PkSlc`cvYQHiL~fKt#AQ(p~_mBT=@(cQ6p$O!HANR7h0qj(5*g<
zytRi|RZpsdF8i@;I6UOapO7h3V<(sbVO9&@t3_ITSeduvYB?2{Qxbni)xJ2MMl~)f
z(%h(8n7+W3y0~!G5ocBVIvRe${)&fYHwstkm(11dmEmXA!Uds~T9&!CsLYh_)wZza
z`5inJsz&EfD|Os&zo)CId<nNq%bhrK<b+yo!<8e`3=Rd>vupVtCcPQ9a2&_3T_R%(
z2XgXOQ{^;z#)ug|P}At=LQkp|+HS84qJ{f>5M{6X(8RB%(M#QG>L#sFPNmr-OX2w-
z^qjK)p3%8c4Ky420{#44_?c>e!E~c00mUHUIVC^MBr5wd@<|)YL*h_S8Io1VQHHXP
z_7JM5`H5@77O7T5@7TRk<0RXVbolGzsvj<1e{xV$j2b9Jln<tlq3+bR-D#(@RYz04
zN=(vJ%XY1d+a9B`@$eeHN)sihsnN8eDlg}eW;X2;9}yTBe=0{ko0bPg7kjI`m(8D|
zDWEg<CLe`WTA;l+Z=O~=Z~jGXLBYj~`D$O2^;}d`e9G|?DJjQ~r=&9qWg%VogxZ&~
z=SZsQg_zW^u;|!O`S3>1pReZ~J)ESLhco<7=$08L-??(-chRbGv*uW9Cfd6+>N?-$
zJHNRzpjvkQ^r<U{k~SwEP%D(eb{cI(kEsITXBV_dCVVSArskpFWpcDyDHzvS|FK4G
zRr5^Nav;BGZ<C=tJvT8R)KfKn9_-74>gF4lLg((*G}7yu78H#=VECis2wC6Nj>83G
zQ$-w_QG*Wtwl!XkylFHQ4v`MX6}bxV6sBqfZ@Cjk%bQx&6NJ6=+1UA~Jye2;LocC=
zhAj|`YkMP;qWsfmkErh(Okp-`twY*c<W(!*#LVL4XsA2$Jtvo*Ls9)1d7>91cW0*3
zx{0t!s$k!YduxL<YB|4OSq;m28tn|r8cj37A1?G%agCSXL(QW&`9MyA3T4+0W}eAY
zB`1HqVV_1Zm1~hJ*@^&$O}`n*$k+-u?)-800@c+M`E?rk0tQtX41p==&B*(43U|hi
z77}zelsh^vV=oAr1{oWPG>E7a2sCh#Fi@B)Q=C09D;J9A)oB!`_NSaoH!bzfnB_Vz
z#BE7(<mt6)`QyY|{%X9uFUPKSZ<5K+?@w2;rz>-Pt5oUd_oZFd+?Iv;2d!PAV&^(9
z%$%xP5|j`w&sBy*r>31gmYR`%JT5#eDsG8Jj&3W_RiPtQB@<RYLo0u#s$~NH7Oohe
zt7XDfTH$rBB(z5L=FZK>hctpS+O52MZkF@68eu(sZ*P8Xl}hoP7W~l;v;cWf18HHA
z;7!TV=B>!Mw>={#8(Pqt7AgO=J0lzt-k?L@b1f(xy%Ew|_zyDqo|Mz+rX_xt9bJ5V
zW^GiL(Wkbjq@GrtPFcPzEOu3xM((#H#Lq7zBQrB2Ju@?XiJw}&EH*B3*^$^2^3>^!
zye0sCeC9D*nI^QmNh;@i!g0o59t#`%WNfN@^R9zicB>xc!WNQ4!%=}SgsPJXiAav%
z55M9RMITOaDecUqi)TQy1tBsZ?94?qKT+3RS><_Ss#;D@wV+Drs^=8iSR%})Z#|@t
zrerW({eqGU3z21IoV*xyz2Q}c)|H_S#l~mLki3TPdy9R{m&o}zgWRK6PUfkeTy-2a
z%-eC)f<-5DV>Alm{P+L-b25SWhj2<D<a=9<)^HY_;)ldqG^rYWTPJ^hiL(#}%HK=R
zE`SRrlEEj|M|C4SLbfEx;b>lr;%*4NWOtjW^mAv<WSf@CZ5c%dg94h^0(o3wwKk4d
zR?DjzQBadSDn4Fb%X=cy_*hfXd-J<LK4AYs2j$AlbUd(lfS36BElJOWgT$4Q3pIjM
zg?}9XPrN)J6pzZ8OBy!c{$g6F7DiEvQfq}&xGJvJ%5;p2QXX+6=G4ifM^BxKISMyx
zOoUn<pBitBlx6a-uSUh80oUYP>-ex*IZ9F@D+j^K6<N`2j$kUE$K6FnB;HdkQu(`S
zTn;h{XXIv)YC9RS&HfYhLN2C?9gK;5tnPyxPL2C%#}ikksD<gmdP+Wco|9eZvfWAX
zYBU`s!bFTILu1gGG68;M3fQd*8M_>rBCA`1RV}hAmG9bnP<>Ules$tkYgKC!<@A<)
zTla0<zjg1{J$pO!lap1AQeOsA6Sl73ny_{4)^+P3b+@ie`4hc@A5D3JnMemW1wJXp
z&tv4)TkoKnJMva_JPE>xOCQuru+4^szn8CM6!tndIdAID=qM=g5T<MfEzm+3wEzu<
z+tr>%+df87jd0a8!fR_3Wlt-nGw)s+%LOv=EF&*LMkVk=m&hyER~pO9_b_tfbM<m*
z8A?|2nlhb+mM7Mr!MAE|A&VMP9>ee7!=MSsS}J!*@;vLWMi-E^QhtPSpyliX#Eg}(
zuLMSpW|XeDb@bMXQmE!O?;TGbN28^3)HO8@rPrZD@p9W*ub&#OUCc>W%VXo#EQ^Xe
zaO6m;91Wr$W&1d26kz$)62a!g0h%}>BhHMGj3Oj+2#FUV@gT{~1Q@l+0_@W#e;~-`
zgv6MTbRi@xApuwU*97^EHF>o!`J<G4(M>{jl5{nb*z}alm6L^79+!fZvp-nh^TCcg
zgS<!nA|b)r=nB@wD9P87?<I#NcO^(VQF>f@#lR4(e%nFL4i}~BWU7EF1MA&jdNh5Q
zK1yGtuhI|cSM=|6t4t#6EaPNeGCz=__sag1{bNWQni#4;Qg$`mVEC=!A;XJ?g@*M;
zgptx{EJ(t~jna(@jmnLl8VO8CM#GF{<}!iIDJGA(#XObwl#i0z%V)_0<=f@A<R3d2
zbuj5LufviK=Q=#=fI1p>oY^s`<F1a$iirwOMU-N%B12KE_^ngtPJKJg?3B=HN2k0_
zcRJncOm`mKc}C}e&Zjz`?|h~6wa(R@-*h3muwB$$EV|fr8P{ccmnB_RcG=z~%~)nU
z%6OddcgDAjpBg`7yRt59Rae`tUS0Qg-QTsP>wBez(qFkonXRle@iN(B!grH)+te+!
z+l%f*_wL=zyE}AW)O}<3o!w7#&+C4-`>);k9)>;o^sw$Rr$<1K=pNtoINak#k9$3y
z_4w1YqiIi5b5lpt1*ZO{UzuJq%{6_l>a7~7a#e+>zEK@eomExzH0;^6r(;jgo}oRz
z>AACKZqGYCANG93b>z&r;hYn<fXnBaI1pCUUDRFGv(&-hA=<A#r!G`CtAE#w(Aa2Z
zYP6aRjm}JCHq30InX8%BY?av>vwdb~%nHr!_F{Vt?KQbqV6U~k3VT)bdfw}=-jd$K
zde7{=z4wvcIlZfT|J+B~N8QJ=&*VOf`lR&9>vN~iPksLCYt(m0-wAzP`!4Jo)^|(a
zoW3=EpZ7)mI`v!FFREWozv_N3`jh=t{muJN=<m@#r2pRjR|kw8uy;V-fXV?+2fQ07
z8`x)H^uTWio*I}x@WCLTLE8pp3~Cwlaqz&wlLto*P8^&wxOIs65bq(0Lk<kNFr;)y
z^N@E#yAHJ)>N)i6&|5<rhW<XxdYJRDg~K)tJ1{JLSmCe-=1OxL^9APN=G)DS%zv@y
zX<=>QXtBT|$>N$toy7}_kCy(HF_v+bYb`ff?z22)dBd{C@~P#omLIGrD-)|BR>Q3(
zTg|pwVU=Qa!YbXW(5lMnvDIJJhSr?5jkUXVi1k|QqT%%LPQ$~8HxK`1gvE&YBZ5Yx
zjJP?n)5w^SUyp1aWj|`ss0N#{HuG%O*=)BtZga}!Pum#V*Q5PMKN({&X5pB^F?Yv2
z86(&!?S|Xg+s&{Gv5U1^Z+F7(#uJ55q!1duLL|R4KAK~n<QFLY8MfvG>Z`m#ABL09
znpOyVc@2snFIQf#lwRgF%1RXBOe47vp?_0chcjeKDe7B_ro@v+`4dup7h0$6M`xhE
z)GK;lDK(VNLQ|-z^kQKOCG?`R;lqjcg^xb;IydTfIz;G84W`$+Q7^^RDv=!Z5<jZN
zk3VVQFj}O9G)xOT6btv<;e+mwnf0$~eyNk9Kx<{qYx?Ehn$nl159bzI9#aoKPPrah
zdi1JkY5uwDrtHAEN$R`w@)ob6;Rg+0I9kaDZgiM#VQPJAaZ9}V_eiR8<?(PI)9G`9
z>>cNvE{|1Dppz|YJ%33sR3P~>hK;}e+9Zx)^RMenh0Yr|{(w%hkrT>!_(j`w5@i6V
zC`&9~633U`HojD!*np-t?y6-oc}G56`RHr+%04PKedr{&A@h=h(*iYY_O#Fi(<hn=
z<ioMZsJ`@6PL7%lzwJW3rUe;Nq~ADP@oiNP2l|`2>ldyJ5bTjl_uz}6XLC$%W@TJg
zvzbL_wBFxogpN8}rTv_14MiFG`G+;gNpPovm-xiHs}urimBy3j-%FGEkz=_doZyqf
zr8P>^(D_}=J7jK(%%8q{fAS!M(@|)n6kS2*Q@O~ey##Yp!F*(|?VRAw&x=P(<4M%_
ziWGI^zf)SxIih`bjp}{nFWIv>M=n}kdv-ncqU9Xw&qaM?me*pnlp+Zwmyu{gnG|i{
z6P3akVayCPW;ZANl@dpueJ|ZVjZp|bD6W>gj5bLxqc|nnR4Z(P1@;##K=SNzP9e-r
z;dY=V=?<Hd+`(xadAS*tG)vKuL?z;$BPG@ZERa&*1dxuDf~(;AT!0)6f79Ug^?pJh
zp^w*YM(|3F;{|BA1C-p!3(5twFwL9-sbc6oN|)3uQ#fy}<>PC~(+|+9CMo)b_f*dE
zp6lYE7Ul`z)EvHFt&l52mS_T1R$F%ap6Oo>)q;^gO&BZC>(meEy6-PvyQ`|s_M5Iz
zOi!#vQB}yl+8DiPoZiU30BIycnN8Og`8hgyFPb!dK~7zQW;`uO_dJ?}I;c>myZ7JL
z_!c=QYuLBNNtcVNO@DK!1@5H%q&}<F&Gen!S8{4pHQ7FH&fbC3G=8*V%~s^=T*(gu
znd)W(=n2o}aGp@>v1-)6maMEqtL{qCdh|%ySm-u>{M=a+$1l24zZO?XAGq&;j8w?*
z(GOswVeFm?l(%2}@;D8efsWP+KM6mV$xa=;aJ)dJh)JwMXDX0Y1$ns^ookRH7R^;Y
zDk{0xTr_L+7?0WG$9mqpuSO5iDdjKGZZ`!&CG@Zmx(f3r@0oeRLo@zjh5vmOGQR%;
zQAld{LXc0MzIaBw=KC!Nzc)qQEB^SoEF*YMl3HO4Gm}|M9_D*V`QiLk<!xOr2UH^y
z!gQHb3#|xByo1(NqD6O%GnzabQEa1U6Z;4Hx=1<RJ-@c5IKTN`@q9-I_xa;A>??G`
zVEVkwTek}`%kLEU&2*jT=b~Z%(2)j6{;c-heXpBS4+B+kN>*W&>1*5a-hFJR51g7g
zw<KQ8{^hh{&MZ4q_K#lA91x?1-s7&LGgaid26V1g${X>cl(rtT$Bp;6`S3weNps7M
zS$1mSp>Rq$>{#h6M5s`Y=cp^1U%&qLvLa2hcG_h-m0)aVE>ME>zB!WDmuHuyXf|zI
zy~$MQKCQ3SjF5BJR;u;O9Qop<4syMRxGzw-{uB1bBA!^pC<gq+cP7hjO3OmIQ)vFp
z`?tszUM+1w^Od~%rcO=YH{jK`;Hf7>#idg8cPL|tPjJs#<CF(A?}y*5X#)9vJU+=$
zBHv&6mqQh-FoW@J>$m><%)$m(zy8zRytTppFiQT5RxjJ^ZFra7c5K&?jPr)<UwK!H
zuH7|NpzVVBGLG*ulo`bsr}Bktb3cZCF_uvr+**TzE780f<A!FmqshIQ<@xTYPMLo&
zE2qKq-GmZ>RSQ144=9R;rWoJ5jSMs_U*l8ioTL^KPb<+p)J66%Cv@smpCzL;LKkNx
z>IamMuJ}!JS>$W05&F@JWM&wrc(|3rD&(_!QZ%$~CgTWS_oNN{hasFo3)fCcE%LA3
z#Gs)jC<UFV70w`kR8Xsv>psX7aU58$c;iMX{}o!-pUdX_;Cv}ewP&VqibaX_e15zq
zqsWABr~tV09~LspK~=dNzlmuu2>~5H1XgP>>c2JKxS^4U&7H%WnXt{sgAb;3!HxWf
zkut$ww@nD(w^7!*4~={<jl6Whl;REW+`{mA{D%=7=r!Xx#Q|vQmRhu;2KVI@^raYW
z*~<tkbst*L78;#G)0K{keQedjAhaTv`)e8p#+c1P${U$;?@&d>8IN9@{%`iztGWA+
zURIZ*bmiFinZdql?>nCU*PN*}%a(2oQT?|qR--tb`VH<dIzE_DtVv8`1~G~&`dx`O
zw@B0Y%q7e`ZYHOw!UL1a=Sa~L6SRI6Hd%P;b_yQ+PRf=>>p`<o_;SX{tzmE^y>n;S
z3xheuVTMfNQ>FZAcV+|)$NvVw7Hy!cXrvX2gfO9k`hi9h(RXN)@E!FtErg>g3YS3}
z?nx~f#a@`|On#1(PpF#!7e$~=HUDch*-$S<{rHwlrW&t__WR*}tNH}jPJ+e)&Dd2+
zp`X(UMi|4}A}{Ak<0RC?rlQ_pcjzK`!Rf*-euTPGZ0aMSD|L?sBdiyt5G(~rHJO6$
zOH%}i65X#B?n51j{{ZUIdnp>^&djCfj-uAZuaAv04Lll`mUisOnN!OTM{QRtY&gBW
zakbvwSOE6M)nIQdaOKu<$UEM+vI%_)x;W~6O}X?)et3}znbg8oKwTW(2__!%!<T|>
zd+!7pyZWQLNro!EFcW|&>yxEGtvJk)=bD^3g`Vp<k?U%tGjZS?81h^rN^Fvv!*H#=
z$e=-$>*I}oy$fqJtCdr-FD7ua8RK)!XiF2Emmkus8TJ7`461xJkWoZ1&`2_YlUA7w
zV9-H(nIiQRSNv)u++n^9`*ttixQ5%rfmSM}Wuo4^n+L<jxX9Sdr5u|R!?^`=C@vl=
z&PXs>%<%)TeF6=a<>yk0)VLH5XY_7WuY`?+FJxtPV!Sb!Bc{RsnxGxVys<JXZt-b%
zmB->`!LiWiYuIr1;5(-AJ22+t$zFPwTg~tRr?DE^3S|>Gwi&mT%_)Y)g3Skp1>1av
zGm4EkA9%xfzY2m*UIR_N5xN=0i8*-88es%<!gx@*q3#d%kU2+oHf8}fA9iyT2(Zp~
zssZa;x1zk$XOC*I^>7UXEZO}ipP#K{Z-Zs;v}oD8?Z%zvmNRKc(F;c14=&EeJLHdz
z(n-uZMy8m`=uI#={Ak_{Y;()OHm7jf`s>8$jPYUK2KGvl37QBt<gqju{*=P+f(iPa
z8bc$n8BmJFnCNGAF8ZB!!*=NX|F%O1|Jx3|AKRf769YN8Q>~=@7$`RyL`Y${V1R~G
zBWYv-`_5K~rykI#Gwi)i!bIu`EsRDhDTR|T2(~!1%7RfO!`4H;OL@~f_Hal}x2Z-+
zmzcL8SxiQL%8Lh>WBd0XRsU7phn}`@p|eJCkwqn}i&igm$fb|%T^o%PoO2gesQ;|)
zO~<U=e@vqgN>aFn25EzhvY|C<5^SnAp2!xM+NLU_*RG9G_jdY|uE@=;)F2nxvAz3`
zsdH-=($NX~j^l*Ng}KwzeVqQHkM3U^y`RyVDtaTo%B_{gm&;LbIeP(BqaT(2t70P6
zi$g1Cs@TYxLG$f2<@Du)M^34;&N!E;*bBGM=ReRWcxyz0ZgV}K581%5sfW1nBak_I
zgoeNhdRZeqH<1h8T7mq_P;@1UXpnwf`EkkzmkO(j&W3yMRyWX3*SeJsRtdzE<4j^S
zw0q_>eqx1CKp#xmb^e?w2v_E1f;m{7D)E^qbl|<D=#gk!W<`sUV&4S@Ex*SuuiM7m
z+sg^83})gA%sNcdDPaUUMcKf)C|scg76yvRVye+jX|*lq)W3{>Rc3s&Zg~T;Zdg&v
zHlTlgRAwDHaV6ihV&O%P^J>BPF%@(y{=hlYtc$5RH`5{(?FA+0!CKer15{Sd^X;O(
zkK40K13NlqneTklX}N(#L2AT~qRz)8uMRf#4UF@d6?*DQf_f4?W`C*ITh-4M`S&$!
z!|_eY>yy;%Pr_nku3U0t>5=oMrw<-a+oML7y{Poq^w=QN@KsS^iE4#Tu2&7KMa8fh
zML0nZeE6Y%^hJJfZytZ2`<}H$gZK}D3|jI>AnceicToY@o^HV?-&I<mf$`jPVIYN;
zntXa!2)Fe#qoruJNR!O~$rs7L-BmtE122oIQ5Tt~FPG>KJ~RcNUw8S*yxR(neFkTk
zEyGp{!|Igm-3c5}iYFvkCx5+*TpQUcelk9v*hZl@J}#e4;O3)Mw@FZOK{;6kwzk_~
zYlA(`Zz*<11Hq(ulU<y~Z&AJ${BI)v*EE}2JW$3iem+aardkYy`JcykzIOk*7g8Ie
z9yi$w8T@YLYrBDSUXQx@<n6Wo_utZN&Gr6Z@qYp(YDVD_Z%_cpe;Mzk8R#Gge<Aqh
zL6POP<x=#{N?G3OF`X8AeDu)24KiCONCe6I1;4@mnu@xh<K#JXLW*vq1FIR~@6<R^
z;p?l{_qP5qI?nh!9{<SXzIxved*MI5KP#E%ZN&UY<k$BPJ^2MuWmL0~E8$Yd1n>$9
z<mY+YD2{$C1NLdk-UKE>|Ef46Dsk_hynno$Lmnm|e7c)}(}N$ij7h(V29>gogkFb!
ztMe2(D<_t^-Mw2{diSnd>BNa{ZWGmf1J<eCOaeIc!NirpFCV|*B^x+}W$#3e53yk`
zaOfewe;j^p<oCyNG5q(aCy6@2c0m`-IYkV=ul%cWV<gW<WAfpIc-q==6&Du^XTxOZ
z%Uqy8d7DIiP>2-u;ZM3SP+)TriF%&lr06?-vvM>S&j8KKlMLTzh1nf>L82Zhj1=|Y
zs|VnO#LS;_7`}NtqgV=**zno~eJ4e3Gayr#d4oh=Adi2$UAe?7c8S)GN!*=uY+uUd
zN(~g<+1^yX<8ZhWo5ZEox>H<gtvhdj+(|2rF~+=eGn;daWBHwM(R~e1@K8*Jg|Z6O
zT>YuwPBFQ^0(GpAq9k-zi9BRAi?Zxpe6(Kc)pzKlJCpaFR9#4p3Dqo-?O7YPKTI{<
zTkGz6HRM*tnFIUwYT{%~1}io!TN|kg3p;QoE$u+s$rEc=?9wQ<iefMM9OOSx%FB*Z
z!gkS-f&yj8yqv0+3Fb2>w12FSEt?TVDUjhb&JwzCtKGi_ayH6pmbRvX&3hItOcfY_
zw1RSu;EASc_*r#A;b+G|A+2-Og((HIG0!nj;K2)Irm5D2!A=W7Ri()DCb_=`^{<wq
z0zO3fZ1x=+n^`WFmN#7<KQ6iROzmVcJ0ifxC*o2`$*IdZIj8(*s})BX<BU2KSdBI`
zv4v=U(MX0}dxc{Q7jc2y0l3f7K>qNKM`7{gQxHl<wyKr(!g1O?h?*)ZK1T`Vjp()P
z)+O+Kxdu>*;Ka({INk*AnMSm?kzIayo083$#jwjSg4nYQuJi+uyF#?4!d??LXD58F
zJ+U$4BB)A9iIr$b2~t#wa!Eb3MaowsD&Ll4zu6QzfpJ$&^t$=$HyTuZl19pDKM9J7
z?m@GI)g?=2ou3TofzMcmx|<m#8JgMWN<->Z;9**&{4Y!cINR)$jTxdJE`e^|o0kWE
zxV4+A{o>FgP|GTetL~zZ`sjK#5B<tZmF()=!xwXFO@EwGD;Nldj$;IwTIYY4_m?%E
zD{d~k<mW!aH^4>h>zqZidB5E99%VCs-nh_+9sA=nj<R(pPJWfHLI(HiP^aH!R5%=h
z)29d<Sl+`aH6H9MzO+DkQZBL|;Rf4RE>o2A6}jY_8%S{#jkzj4%vUHM3IR7zz=K>N
zjOK?4JCzk^EG3M+igvol9EGWlXe#9*?7WJ`(iOs3N->&2!?)f*@g?Z=HyjG6cv;TA
z;457?_E8}(xG_BCql{h=7rIm}NLO>dm!~=}UNB>mx`ICaZSv7H)ycyvBQ=WDAIp#f
z7^60UF=`WkPzkb@%?xBCD9d6Ptc`W6K`Uz<(R5=`Ap_yC7<B`UYzT;lNQQls-9SWa
zQ_L2MirLCs4)xuN6kgZ)1ZY-$F`F(nYJu1&^jqO^j!kkbmaz+JuF2S>>vO8yr-Cag
z>B;XcWi7wMss8pH1Mm<N<yVZ*Xz&$Wg|0j}4!I&SrwH{dlD^@;R)VkX!i6)TT5V`p
zU|?w4MKv6uH6BGyP97ePjzu2T)kQ@$Y7lp^=q}2;(jW%ywf}g#Q5MA^7qNB7{Km_I
zsCcBh@fDbimX>;!qNBy^rR)4(E?fXVUU`B(b97}`SlsfZOXE||e65*AyL_MRTc@ha
z$}ZKg>DSYjpAOxlW;5(h>`gwEVOorxvwl9o44Y*i9=j@B&Bk1hrg<l!H}p6CI__}j
zb=>3h>+M|@wxV+|eZ}OF%9k+w)A>Wns1@;ez6@4#ewSdr7S7nHuB1<Wd-xd4S2CV2
zL5lXIA@<z$^Tnsh<7xM^YA@HEk@DU6>B@Jbn+FaYJ-%13hvQ$re%Sm@9cp4d)y3A<
z>CR6-SKoQ`sLI7k4Q2{ujuVwf&#R|S>^zi`W_o$qQD3!<;E`2L*WOmHNm#dB?SK6;
zoxFG5YK`7d*}$vMF$y2pw)ZQaK@7yBu=uO-#-dWe-a;C%VE&nr<CC%YU`UV6^zoP_
z$qQ5hJ$e`bj#LkaiGZVDP%jNzjXKRml-<wj^3;`i6nkq_c+8Qg<Yk9@4Ema8>(51H
zM4pK>WPh|<MxiajUD(Rb08gp}cv2O>lco+<E4+{idO+5p2U6tuHItn!D1bTOnHC>n
zkW1pF`_<@Rb<-vC!lgF=#2CZq=6$Oa<V|*e{aAFZ@t}Gltz8=)=c{sAeEr_N?CW>3
zPRD8YYwpsGt39t-t9nnc6XY7jC<a`uq5<at7;s*VWl&GvdMF3!jY#pY^o&6`{o!+C
zG%bZ;9|8UAvCL{d*o*sbE@r%SDwu|DWIoF@MnPFWXR(ig{tfVV<74cgK;2mm*%ToA
ze6|TPUP6W|e{vPK9PZ?%F&ul}ol}&bVY0ly)Pug$t#t+i&kb-2gN+B>bYuiePq6bi
zaz#k-$?PLoA_r)pBUkp^xZ&RS(8Tx1sx`15$G#9^1wy26XHnT|Igt@YV<w{E2L@pZ
zx4{v%9yS9(*Qan_wVnlT;x}~p>ldq9RzGK7%;LZpC0MowdU2l%ZfmtnWw31rI@Qp0
z3is-CBm0u?9m22=p))VtIriS4!uS!~cN}`{$e>2yqZy|F3s5dnjOPSfemPVHpVyzm
zh50arc?f2wdARB5d);uLgklJ{7*(_v%sR_mXt{%Hy3+zR^@Exo>uW-NbjzR!w%}x=
zjwIYw>0rtZwPN~zl=b2i-*V_jN9M}M(ST1<%;TV|Gr4oEnNa^c?%{KAy62`aIZ(YT
zu0HBx1SW$ruW<Dl`cjASnrXAm2>oxsYB?;ficv3C-CF}=WXZq~<s-t21H;WVXalz0
zpdD5Wdh&5G7;vn)*+>`5*>J_Er`}Rhh?2|!l$Eag-V-k8c6^pAkkr4(;9x4l>)Ct)
zQCQYuS0^Bqt{Mmn60l_@0h?A5TsdoQA<~UwY`DzKpEEPKbXarW)wApD^+uR$_CUMN
z9%!%G59HWu37h$`AgkGMWgr)3aEVwhTwKq_wUK9I@;FSMjmh&8zT%hZ1;#9UkQnn4
z=5pg<bymWiD%xO>gKm&B;{a$A(d>eb^7}0~)L+Lc6BrMb6XZZ&+iWuElsOl#>+mV3
z{)O?m#)B!$`X=^o)CG6(fo__Z%eLZ@Z9L7HOvH|30=egV7;Hf?#Z@#<4ug~@z#AuN
z0E*{GPXruWIi2$)XB2gZo-z#+ynSK3`FU((+;jG?&3v^IFod5`h4Jx`YSh`}-?&2d
zMXJE!vd@v}0d5->uCcM@85X0f8l&)dEo)*c5=6sG76FzQkUVGU$EZ93#^^3FmkZ`l
zsyGsxg}mMzHqD5`AX;b8Npr4QxU5gcW6-oOh1uKG4RsP{`44bf{SKMI+;GMU<&9&Y
zFrF)*+l(vU+{E4+&al7w@D}<???Row&IJ=p3F^6=K|c)VgkW8FFvUz|pbKj`bX;sn
z_~bdvR0c*2QqGujh12lo+ZP$k`V?k$6Pt}#vHDyzOklxGvjUrG+Bjg#&3sNU!($<u
zNMI&JBMIu!Iw_VDliq?}AWdN`n6<4dKrcw<Dq(K00R{azoI~Aov%!jza-MtGBt4Cu
zE3Y5qyj*Y7LeJAb+9FEzdwtovqngR|%oU5ZE~@dKH=lzga6G;57A3gv=g>m&&6oZq
zwdnRe)tzkZ%)^>``d!TAvVL&?Sd9`iks4Q8aVnDo#{j#(1pr)W7Zz}V8eyiv=?n9j
zKmd%P`6Ula+8hVZlxQyPI+_xC&i*xmbEBQiDIt+o40%>YmRCv3eK`Dv3aX_%;lp5W
z@K-49#>@O@kRYxbqgfz7fZc)iW7y1$<)=dTs@a?qdy+-e)+G=O7J*>U3{WkQ3mT$n
z#g#}7J9w}u86^q}`*6b6)^ST1!B60xpU0*$;~yKNUbB&+wTyjHsJj=&VY}nYQEhg|
zJ)-8diNm(H3vEWnNH98HSc)|)KwFV$(Q7GMEG!HIb?z`4J`IE%@3&y-y&BE1i%+af
zivA8fI)WuAaRLDdQ4AN0JQ8=}#F5mKY6ZSfQA`;!y!{N{sAyyj>QjxVF^sUuOW9nQ
zb+37D)_9FDR7cey6B<;-4$7d|SYP#6WY|Jy9f=9nD0C@sJ=PZ+BbPLgNcGp_Ti$XQ
z1If*|jgbcmVcTxmulSLmWHx|eh=xt#jQPM)kSPy<MS3IHh3?a}2(+s2={jKsb&Os*
zH*RU5ssDNgRK`7rKxGs*u7VR*J4pFfI7kT#X<(myvHs%Gr^ZN|^&*SCn*eqHoG=zH
zp9D0EU#QFsNcWx>u*A>z($a#fm(FIX*`|aAuas3!!4Bv#k^-`p!O9%&004{E#HvBr
zglkgNHw!@D)PcTP0Q#m*)Hl&TqIi(Z0457=<_h!?j{juwrU5`OO1A<@Sr^j>?1l{7
zZ9q{Y-fhXu)mK;-9P(0%{u#jtEjnwU0T&Lw9@@M(-&iqQJ}#cgjE1ZJS}|PVJ7E~#
zqZ6&>!x+$Z7t9fFaIpXg0bf}$<AsT0Qt2y@0qz3lyTEyynV42`8yFlyA>EMOb|^he
zU?6z|9`EjL&`&6X>`SE$pqYL^MY2%uxFvyly?k%V#WGF$ztGP#c#X1)p$|W%apiE<
z*5hU&C*9=!TpYf}AqRVu&^s(PT-Q1F=YGJ8@_ZX?4|m@DHBth_F}(Ug3^K$e^}}NM
z5}ANE%3uV8Omr*zbI1}Dk<FE;L!~n)B0pfDB>U<tKS^1no793f14wdn3qPr-(u2Ot
z32$VVIS&~!6=o^{5&K5yAX8-Hd)<!%KMIzJL_r1C<>r#)mgEw>45RbM?bm0uyqd!8
z8qxOAe0vHf^xuxZ#MNof7+<*=-on0^3X;#;qjcLit>2vl*280ak}g+(`w|OHAp67?
z<hJDHvkh2CdX@(w$pa8co`EEX`X$!;0;h%+zC#O}gZ$7WK>qKpv&YDPqldMqkA^oN
zt237kT;OZ(<eOEqK`rzI0UvDP%fS;SYxBxMr#u;0Q6HgGnQ)$fNvxrz95#I@9@4(8
z8`}MJAfJHi&(e|E_|Z}s!YQ!#t`o9_w7a~QHzT;|yu@TYLpj*72Yl`UpH<?)<c+wt
zLdfkU+J`fd?Mo?k{h{Z=EwKM!Sr6*i!l0KR&7Jog!Qqs8)DcqBtQejYCPGqM3!9MP
zYXl*yC|R8NZ(27%p5WSmRM;5^sl%0PPFz@%4gUDDN;FWzKL!N4>^~6bEjoXy|CSK#
zWK}@e-XlviSXLAl<D)e05p26EaWS3|1T0l1*70lWV0(gK8I3;9V1PFNHmtwrC<N>{
zC}2Coch!vpN^GqGI!jKl6SpyT0Ez;#=mSJU&@ZsJabnvh1;^lHxU_2N=cpsVblj%6
z)xkq?8cy&^(REn=`t4w*L3cci!_p_P9>kz2{0`u<281Q-QH0Qtm(C2%g)IZ0F0FEy
z%QfOoNn=1*ee(JxNUICQc?d&SL0|<RB)U+AJoR@6xFMBjiQp+vQxPS=rn-QrDuF_t
zXi2SZJ6toO#}YSKylK9L^N1C`#-7XiVi0cg(zx8y3?8kIHU=Qs>ao%=l{2~M*3mP7
ztob&O6{C<M7%K1W%)5^(umt=V?}bhnSB+t*eM*r6Bm8$A+4k)&jW%exf1s(~dS=g|
z^wcYLS2Z;Sm&=gUl$VVaKFWk{(`ODGIQ=&2rbZvf>1f$>ENNrC2skx@klHwk`=ND}
z9XF4I9aMf#)E-V^ncnoX_8_P}W+>Tv$n5Q7G*<J!FlJu@YhMhV6`ilkd0R(jw<nmW
z!2jxt4+MR(=#z!CZ^g6WOZw4R#^3p=1XlD?K#N+9Qm`)IArMNoK=kDTuXgdjkI3ae
zGy?UFR0afpypldSZH0-TTR|cm)o25p)IY*WXZ2AH%76`*R|F#d84gTRs}tHhre6s5
zAbsZ(R_K*zg0+F^$>88vUtZ6y(bHZ9L46I#=~scUQ1IdzxK}6P+3Nf$^a6B@Nu|*Z
z>_Sw7B+4c0{v9l{Ps0`_36{#wSAd3H2uItc3N<Q_y_$Zj*C^+?_N&3&lezzFN}lRY
zy1$3UU6!<bVe&%NP<wY<F;*fMVkI=}xFs^^($R|z7mhy7;LgHzyse(SdL0g}JY`nE
zS-6Im`ubg30@pBrs(npzxPx%%u8vi+SFUe*rL4v`vOC<!v8iazM-}KeS*g&!*i`6X
zR;nv^l!0KD&lb&V0F?nHKp3F&1;ge#Fl=61#2w}U@-EeDAmC}w>(79r7-VA5Q{aXH
zsd$r+DW>T~aVa`&&IwP2%VJvdOCWpr(wRd$5f2C5IxZdAV#^11So693N^e*Xf>O6q
z=>X2`n>Rd>rY-jz+QIKqB0`Y=NASfLkrx^^d`=U;3(P{<H{50HxhLRn&K@NYGDPSc
zlv6;{$twa)DCrEhv59RpXV|~LLsD_t9T>6wqH2Y^P^atoNqfRAZhZ8z6#Kdh(JW}Q
z0~WT<H6BMHTbo!NQek^O3fB4PRjLn8jI+3z@1S8NsPi4Hl+mRXFk*>(4>5pYn?^`z
z3t-67^;8D?#fA8r`X7(TPEFdi_W*{9uUQXJ@zsfYk}sUf&db-NXQZU>%ru-rd$042
z@Ko7@Eq99sRkho4g^pte3OxkwkvZyk;J~-r_p24NnI`CJbNU$$RF2mPdJp`g1knZm
zI$<C)5V<JXH^gWTGUJ)qSY2uJ*1tx1=3Im>8a(!o^yJ{NpUu1y#ocqwzrMZN#J)^G
zw?O1=O3=%^&vXy18IkrGum)mq-JffI{j#u$eS>acsk%^?1N<I~G;c%%fPQAdvHoBW
z%xZ~l7a##TAp$I)H(<ip2T!n{eAH~29xQMT&Vw)T0%$2Rhf_U}gHC=|w7YS4QMbRq
zhjJ?uf-O8~pEVInm#l$!B@fy*%jcGweR07Z>zVIvAK)N7?W7Vp+!h?b=8t9LQ(Q)A
zAa>pTp=D^6Pu7bcrv68lT}esW$6+Dl5B*MM{}O=-f!wo-#0uk$xu+UW<g(Au*w&5^
z7qMaaN)`JcI<`1K<rlMHiYAxN*tRSAu!?<lEY<yzD)T4=)385H7G{MpW@3l~nyZIA
zpia2*yaWI~6eNgVLo`n?1-Qe?1W-Nl5&-a!Ps9Km0?!aACOQAJ4h{vD1AWd?PT>Yp
zfm^&DA8{T;4kuA4=mjFjzmO7uqWA=%K;DsH`ECPHKxjb)oUM8kh09Q`$p?@WaP6%`
zwqoLLyh}ZiEu=<rBHE%D*OAwH3Q`1jxK7hvqBLwKZh@0n1Y&?Q{~qidtl)#Li~$u7
zP`Y@u^wsTG0<xk2BP($BQ|Kqo<`r;~MCf*Y$_#;ib*&N67G<wt5xz#g;wA0?Jp%@h
zFoTe`v5nJH0-tR3ZlTgUZbd-i`fs;wQnPU|O4*xt9%_%xLBGmgc~tbDJwGgXnVLOU
zjDaB8=!Pu+P|mq)s;V=KXK1ih0UsPq2hMN@02K&Efxm^W=!|%BfC0jP28X7$n;-DW
z&~yawTVQACfe!}2pq1+=Y-eb5?%)Gf@84OC{W}HV-&qa*odWRhtp4oZDFFY@YQ294
zfIMKIh~?HWKSNPi0~II`4;`C?%pGP!@?-tds6e-eLt=}|Y{1-Nfu}M_#F6}T1CJAw
z^5O3**`EMT(jdY~e!_4PXbsph62z(CbZQJNAwiTn%e3K4>V%ul%xP>$1G2B+ATQB%
z6NyBL>MTQMH(Yt01Hgrm;$)0R`*5xgU{glkeshO>-Uw!h2xYk$J-Q&E;pB<$cPFW9
zArvNdL!3$*mz=G6{p_~ooA-Cj)-eO7gRfB!YPnW14K9Tr8`<VbFdZ)d-2%8GH)QCI
zZ7oej8O(+=+`1+<w+-nM3nmo75e@)08xFMCc(AvK2AMW<iWWSiQ<-<SGnlO?<f|q&
zudUchD8^<A7ese#ka%PVae$$jQX1LBwzLuD!9`M#(yo+%Q&q#Z=+6V-S&I#UOftw=
z2b;j+haO?~EbL>{L+A;noz)3?+YUBqWiZj$s1(-3rYE$q4#nAQ7BQ^dReT0pwX(pf
zwHUWw#N#YLukh5|ojj5Qhz>pjFiK|u`oo+vpC+b#rUc~9NNxmgF^*Xb_9kOA4@4^;
zG!I*w_6RHn^(6lr)D!-1P*1W5>Jbf1P?;W~11C=LzXNdf2%b;osot%V#xcoU{T?tO
zf+gyE+-Bsbn+vw6WbWxp5>0uj2lxnj_x4G^L-eV1ZG%qC#CV82jDu+0xpKv)08nA3
zjD1%2Md)X>?2GtNr}p?zG*j2PQ4AB6v5jJ+Xm-+}&4~xW6eFHkXx<Ac+KZ1YaGalK
zz%viFba;LR0%J_86_zyhUVPCUAJYn#Xj{rsD@-vA{0K79k8lC~1QK&3sGJu-<&0?^
ze!JbH585mgEGCg@8u;_S6*b?O){(DHGNARwwAwyz_vK@=eoSl6%x5`FYm^<lT4F=D
z0`123aLXBt&;aM`P7yQoB%d$a0PzrqILq~bLAY(hWN~OV#8+9M0f0&wKq<1g-{A8(
z8USz<_}m5_4)|sl13^3oaKq=23h0*qOu%FtMKUND%K(`Lkl6#(m*0vj%np~6+!f}2
zt2Lk<OI%u`NQ|j+X~$H(LEGBVBSnB70Zho~&zO)KfC=$I$?ahK<aV(Ad~~cG<8rJW
z;{s>|oG79Zjw##F2!f9@R47Ib+}OpuN5gQ`!29<naMS?KX^R`Uk;*N5YKG$mhK)3X
zxPd_D8vd7d+~%uHrbfhVwx#N!n%@Cb(-EMW)17P3p>~{Etn$JE=J@`7?Km?Zi1u}n
zMJKFn$C<4Yab{CRoLLRVneD?kvo;=jT-hbSm8}C@nWKm+tJ34j_O;{60G9^1vZ{qS
z?YOdaQTv%djB=g<Am^~5%;xjlgC^-Q(0)q}a<g44z+`*!i7nv0U)7eqJEECL&s^oN
zovE7Qee-#|23d-RS_d%Mx{7Ze^e-uex9e56FKcHW0uS7~Scft|jgce4WV=k`LMt}w
zLsly5eyjck5UcB~mDR85SM4Dy7GlVX+o2^mWToI-bt8nV90oL3d|Swh)2EP?QXI1K
zM<iA8DP+aT>vVaXdOU=z)Oq}}-cYf*9pF}j0d6%Dxy_#eZY<mr3oyX#a$(ht^l<N8
zP)9q!ZSrwuL-ZOE;5JDQaNDT|xLK457Ba<Vz_BfBW|!aD#x(<ut*Z=;)#KQP0gi2$
z9>+Eo<Jh`FXR{bI3<Gn9*>am9lH+U>Du#HB{Wy~2nLd)kQXk1-3BGv%i#+=j$#G4L
z<e0OY*$i<SSpbOI(JXB?;Rn|W=6n_ebNKYd!5sEKQaUeTGPFt`&2jBMTGuRv|64VP
z=9ug4GDR&c6r(u?)e2W-XgEZ3+^H!mgJ_N^)qqct>7zOB({(#DuQjTwui$8o&57;N
z9K52@rHOruLQ#e?hh|^Y7yCI*o#SI~w;;DRK{JjPsC_L{kg*D}jV*6#7Z*>}$8%)i
zc#g@nfPRsVo786wj_25OC8t(ZpSQ@>d2ZlT%_10>&+#00a6HGWIUKH6AJ6f)0&NC>
z<ROUXXub~d9A2}=kMqXy98e{F@PPxiq;-7!6d57L_Nibv$0u$%I1gq*)PnFv`1!W%
z)UoX2g(~Psdpt*BExHU7uEy~k#U(BGN+6zN_Uv(E-Elm}4`Mur>rFwT5=_n8;yF(0
z<2fEeJjX*Zo@2}lp~FOo=ZMqn+;RZNb5#EE?2bO3W3y<7A>lHY^1)&}$7l!ii%f{o
z$8&7{6wgu6=+%f;!4Jf994JzbbAxz}()@cZr4Y~IHh-)>p2KA>#B&s8-l-^rc#gS#
z)7s)WIzAWUIh=4jM~Ob3!=~J<_b7<x$P(i@{%~611@Roco;#opYUq1=G{>a|bQz*K
zjBzxF`|PpfW<xYbNy)vIVjRu!gBZ<mYZj8Ikm-xIXpTsIG{<O&<`^wTb38Cd9qvLj
zN2+Gywlz4K!?~}eOItL@A&hrxekW}%P=+3jNlQC=^vs!<V_{)2(V<{0Jp?gNkXd2|
z*t_dLQu~v4A8q^pU5JSd4NqQq>}yN}EX072ruWh&2nc~f2nU3~XGwcNh;T@OMtyZ7
zA`bP-e|7^yV)zcF9=AX@!SECG7wV@xE<!pWsw^QiZbj&l_>?nWt6lanOZRHnE5r$Z
z5Wo{+XyFDw#*I;+sCHgO0MXbGZ=4Sx4TLbqSo>BZK-uKm@8OdHu^}RC4D_MqML5*F
zu|3q>4`Xf?PRAII(`^`z70Wc}S5VmXO=)k&X3eyH%>QZbJm91#vd3RNvoo_hy#d&r
zC9E3&Q3S+@Bm+4pNl+0)1q2jDlq~8QPmV*y(*p!WBq_;&A|eP9RH70E38x?`2)8F7
z{OILI=>PXschAnQ3nB-1|LRY5s`TpBd$01VK7cX*iK&1wzrny@s&(~<>>n38OM6yz
z`uykRomQqllBbu_%*!iCE7Vc&_7rp!3>e&-d@ZpqPUt&$!W(Z+d~N!)i9;XkH~66k
z$XniH;r`Qmj}{Np$4(F(37y|=*Y-AaB)ld%67VoQyrkFKoajFaY{~jvBeJ(G?Y8g-
zXL2jHJm!jN;q*C|7c9FyJAGD-Tjkx0cUFx~*?c;&qz`<5QNf8w^YPg|m<^kAxjP*;
zr+xAHu(|D>39-4CGb3j8auT(S#VeMTS-<%33hOe`o42xHIqAJSoAl<bd@S5=`n}}i
z-Pxo!Z{_gt;$G`=(ub^c(|f$2LuA&Ze)rxts6VAUamFxelN(<AsQZSjb@M-3yQb^>
z>jw^g`l-irw_25!MQ@3ASQ;8Tdi<D)S*cbvjq3+LWwkwtE_|Y31c`pENOa*7UZNi>
z5?%O&o9LrYEF_aH#%Eg>vT$pDbYKVTvQS=h1`1zNhd&>lu*Aw)@{HV|lL-ZXeJZ>p
zvoBNUOZqckzGPT-`j7^{!a0$_pnJyi;mI-Y(|B9V641GN8)XAlz6T-%ocII*Cq6MO
z``3mA<(}g8(qqrKiWi#Sd~=KDOK)DccInb}IUO^5IGP#Wf6vj>&=XGA+YN`Z#Av9q
z-LQUZtNy5L{d-kG?mOA|kfk;MWPgX%_b=hOA4XGTVU3m2?bYybZ(UZe1Mg&8Rauuc
zJeze{!zGa3c0~+qxo6u|?&mB6X4_Tn3Rw<3T=r*K37`9|gg)8M$D!r^f*O6o!`&rk
zLstqwddXRx)>fTpWaYr{aG7KEuk~C0JF@lrPt3R0xFbwjGE?iM#Y}A{JyV<A{+|zI
zxAkUfhkuZbd-~W_GSmBBm+YQ;ruV(u!aZ|SpBh!rZj@DP)SIJNb~!rq<miIVW2}ot
zzw!6PXzkHS8^c@7HP8PoIeE(q$thOPjDpuApQNjCvgf`I`f6vYedunz5KX8L8*dr&
zjU#Bze+eC)^L$_Xj{1}QRs1#Bb^o$|JUTd;wMuv*VRXX8gqaEJ6ZR(@K+EQNftrEF
zf!2Zcftv%J1HA%I1x5s32uu%r8rU4z6D$+V4OR|b5$qD|7VHrm8hj@BY;bn)gWzVA
zB1O@hX@+Wqt_|G~Y8QGi^g?JNx~=AgmWRFz{SeA$x4xBFI<Z<}qr_Vi?@YWWv2Wtw
z#6KneIdM3<_9rFIOq`SWQR0%sO^JIF4<#N=N=&MdR4J(%I<2lsYMyj!Qtza`XtWxf
zG%RThyZWam%}ZLIv>8oShmuZ&L*e9bsc=TPY&a`iHC!v)DBL=HbNKdf_we1}KH+}h
zLE)j{XTmRrUkgtO&j`;6e-d65-W2{eyzf-weFtQXXAaG8M>k5`(4rZsJ8om={H*;O
z7VTWsYwmTg=hm7S==SoEaWg7>_`$?^OJ@$eYb;tlwmov=#}{Q?)~w^z&F0;=`jOo4
z2L|Rp{`y0GD%{a?@GTvCOq%~EG(g1<Qg04^Fm8qnQorpEQg?9&snG}(?SQJB($*zb
z#&=t+(A1o3f{pv%+T|*CgI#mLs$yNR{bws_*C$<?OwHXU`l+tk5{+a<FQLs+I@$pR
zAE%ssXoVV`5+|^C1ZBRO#B&$`H9^R(S7?vk_GHh#w`5({39Ubi)_uL?y}q}N&fOZ^
z^=PZlFU*RXb!tX)a?LH)(qP;`Pn3q@ytv^Yvr}m1pf1xy%}=vD8g5ClO(&CWIx{Eb
zZcXThp10ewYIpu}&)$V!?)-S}gWV?Qn(k_-$<~z>vrQ(E$5~?|R+H#h*>l#?dNuIR
zu`515#I}uVRz=ftqB8YpjSX7wC0G{*|1qz7^X7MUua_IWFlhFfG7x1s)anDKP!Un9
zoAlrR+$%8oPxno}gY6&<t3-|5^u={PzJBMNxl`VHJtuu<uW|1WUXb<kKet)6t*fII
ztn_H7=#XfQ=rz%v(LY;ls#v#tzkTAAMY*4a`n7K|<npX&wdmDfS-shjOS72KmCLN)
z#nCEJ^XBGF?t66HoR^5c#n{MwgZuXDo^@N_cb9DXX!E9b-yC@VnB3IJhWT^1p&6>{
zfWh~m*5^YsLtV3=+d2s4fK^_Pg6D-z7MQW=$$I*cFb&8^^cPgN{UY0q8vfd$MYKM8
zdfu|OMxsrv4`r`V|LE~R^u=w~iy>>Eb(~Ep=CPW^=K;+~^=_T&x9Rrgr;oAp@X~SX
zI>>g|;7(R-=b8m=t^Sozx78EnJCj&>F-R-waWr&AI^8*S@#0S=FU>vDB-rKQ?!9l!
zDyX+DB7^BSu`v3xe|ig}Te2{^)vfp5c5lw2L7iqc&eF3otbqAoTx?e6!Kg1(y+OxX
zx#m|B8)ci3nFA(MKm+^qnKFO^n=+FvroUbgskEU5?c{{5+pXZsIrV~#pXq*Uqpap#
z=d517=)HI5(3qSv@0B-3E=p_|95LYOLA~#0+iRCjH+5`SA@$zu2YY6}sTGUmv&Ox*
zo|pVy??Rnr&5nE;o!ywWT8E&O6@4p0>7}iWrZG2}6tZ%^sTn;#dVbwYqPe-L@B10q
zSP`*WMJrk5qK&vJMa!YSrj^ChYQ)8Vt5r^q%xEo`FA+Ma0plj3Rj(HN3KFdvT=KtG
zjwyrT#ik}6Gg%yRKo*B|LB~vet&8o4(b?z%n~g58UY$F?*9%=>vllV})o;i{C<B}D
z#v89sKpEIW{h;LNk)U<3;E}+mzqV;=<%gn&etjg+zMx~s9L$8fL?=4n_KFU;@2!ML
z-9<e%l`wE<jb+r(ys!V~UC>9yi2KLrBb$gmvfBsV`Cyl<t2(XUw{Ow<T?^(tc>Cns
z)Ry!H^kj^A?9v+9W?q>6-gi@a>rsQ&5OrJS0;z6e`Q-_ty~Z+1%4jT4{R<FVy;nwi
z2Z660@T~*BmZ;@Q3RvyX(UpXDiI#k%MhfmPUJlV)n?xFC2U9n@8%?~8Cq&80broB=
zzA9vX(0ATvf&R1RJhp&tmgo~){m`;Lb8_de=svyMErBWb-8B}z*9-M?%|X!?Yb<k}
zwQlWGL(GDIM?|IEV?jO2{e+Bin})?i{CK?v;_sUpSShT5__pX8h-P{Xgef!m=55T>
z)5sk}BX?3<BiD>DH5X_byf<1WV6s&C+b<neUt_<V-(6IFjnS&U`rLK*<BqDYxmwlN
z{pP|u5yrBPy=U;tjjVWhEAX#ibSgc*SFN`K7Y3v6M^^+=*@78eB(wY>Yv^Usp_j5B
z!jkXkB?-|*tu1taMXHC*|9kdG?ZMWMM^VYvGmHkRg6rHh<h1fxLr!PISwr4kuOa_D
zSLEjO8IIh%zps{?FFgIo$cM5zckA2pp<Ht)O7+Z$-dLM{YtgN>EG~0)*P43U?BoTq
zZI-3_iqXNeW=uPly{Svz`v&HkH$I)SIyi0Q#EH|gW=wdfU#@v<s$NwLmE8O52ipri
za2E;;oXPg$=2jm2d!jeAWWAtdxO$*Th}CIk6f3XtCv|AHUfhEYLsRyBXRJ3TXKyi|
z`ka1O=j8{MTNi!KtjAmOoF|)^%;q=II$d`2I#G=!E!5mwGJD+Lr)Q3N?CMD8VBgMx
z$A>=o(4Y!^CJmW3ebVdG5X%v(&MG!UM?%p-zcz{#lwb90`C#fL@5}x-b5*b){gH6%
zpjl_Z9cyz^8PK=deT8}^cHKrAm=8=CDjZPoZ&Wt!X!GfH*R}0Xzy692Uw^&w(;f68
z@0d6qtzCB}tX({N`3I(z)F7JF_=Zhpr^mZLU`|15y!tX+)`FJwx}{6kuDf|@b6VET
zp?<_A<urx*Z8GIFkos+K^rFF5t##J+b)vY<uXTG(b98$>;OO?UK8{3Fqp2TuWf2mq
z)T629y-)V*+r2_NbFejbt<`ru0l&7kZ}FRNWxs<~F6(mZ<7icO&#~n%eWLl);z5J0
z>la%WEVgnNgHY329{PCP`!hev`s~f_t#hOAn8&WTA#CPY70sYk*0d62iCq6^>w@q`
zRM8=|Za5sRXw~VOvm=!EW-YTy)eAcwvMO7Z7auy1n`%`MWn87UtpT=_YgR`~<(jg$
zfo{?!_BOor&@?kQ+B($pwE<Hf&FTN>&;jPqDg9r`nfl6WQ{Jm!wGNp?8m-f{a<pO;
z#ba|Nn`~c7?b<DnI=G<prl*EX8e+c90w8ns_1Wf6AMZ6A(#NSv9oplmHLX^=qRrY%
zg%Mo;hE_XRCmPDVv`*`9qluwAP3!(+{iF9cN6!I+-FGl2_lddc*3S>n{WV#H@~OGD
zVAxtSdV^KYTy6cZ+8lzAF0=A$)BlyI{mNN|$_LXOsjg?$_nXgJx3pccVYxLABtoyH
z)|eZt@#_rpl1br4$nC0^fQl;XqW2%~Z{2T3n|@hv-fFA&mxj4l+}XOf*NnE80cFv)
zVt!y)wV6QiTR)n<uAFD^4R(#X)JRfQeGODu;~SObdrDRGwd6dH=_=i5$T?FD`QJ+A
z`p)A-f?1qj)sT}uW~n@*wF>wusH#RwRo+;ovW+h|=jyo1G&ZX&;|TvhQe{vOR*Cn@
z;$MfIrmpo}sM;FSR6}F7%J2tOp6@0VA)c1L>s1xs?L7UerST)r?tDK>n&0ugQ<e5z
z$+Lj}yRZ*n2jO0x=RlrSaXYTc`f78w)t%}(UmN1IRl4tD!imjP7jf=IOM#hZSu~uL
z=FEo-@>t$j#(7&uRU^(FlV@e}QrW1eGK~M@xsP*ZHqeJ!XaCDsuiqQ{zEq9nzwxd1
z-&luf$oB+rb&V=Tp06=hs4~D;##oBw-FTn$JCNQEVCX>llfmHz@Gfcoq;l0pd$_MG
zu#C31;J+;AJX8hG<$<$2XFO<Y`~pm3ci?snX*E@qI8me$u%sCs!SN(rUQ%Z163$mm
z<@w)M6~S{k%3zjjAK^I!`vP`2b~W}H>=f)A?4Pj9u#foLsH*-uIG5~vmEtqO>*JjL
zb*-wxIU*^(9i+Vhd>jRipYWGXK2Grc4LHA^ymO|MZ>GA)$H_#xj9%p(`Z3z6i@{f8
z-!fIx$beSH1E>5KTy9a#`CmoTgV02AEA#<v#9L^iiTchyg#8u!ON=grHsY+N4WWy8
z8&4ZT10`80AEAdhOTHVb4WzfhK8`&UqXo%BycK#7T8Oto3obqQa#gnQg64-=>N;bV
z#-HM>xD26Vp`{q@9Z_p2lh3iM1xIi#>iCM{NXkm{kl@L)!cS5b#qg%Tg}>tMH2i6P
zBVXTfPSAH~>-$dC0I&Of<0+#!9uz+Gtj4+Uqi5srE;J*&=~*d<c&qu<<JtJK@!o}p
z<854-X`U`tW<r;mr=;9GD?IJ-bzC{>G8G<=xBFDs<#|827yLu#_Az5(p$u^4KqK5l
zHq=HoI`Tkdf@kCD0RDg{8)L7+*3xo9Xv4D&;b~89#M6rVE^;K^YWneH3UbAF0pX9R
z<yhh0H(e`yE_MJ`czqZ)p4T7Nav#|clP|~&ksDe@A}2*wY59U|0q&|IcSI)e?$ik)
zPeqm@*F>giS)QoMae~qn*rwPTSoyAw&BM0Fid#+Wd026m?@O?kVK3x_%GTJr*jDaS
z`Q9dv`_!erIXt%$_8WD%Z<3B9-JgfdPvoqeYdL#L?hkTqPebej`U@qMFiKdBLT>t+
zG;4K}O6}8apq!T`Ct)g1u99bvlQ}chc`Bh{=cak8TI2RN<*9n@+qTYA*Wc9s202sB
z@o8}GV+yiaz5-~#OynG`GT_ho_Nye$e@sVKJKqVUTMCI^!Tk;>Q<YW`m8H(RtJ{E{
zYNo!=*7pzfeWAWD)AyCVICVg+)%UOTeY?Kz()T_3p5LeU-QCnL`d*;#C**D<=zCb-
zOZ9o+u0BSlzMrS>mGr&pz`lbx_ekHb()Vliy_vqZ(f3=xjp*$B{X5z~@p<aIV4CBS
zPI<>smzdn8<isNAz_y&=c6!f5%A^8ZTa8nc>ZwNRdN-y9?(<p~2In+SgBE}T(|Xr^
zyH3lXcG3_Sd7R}`mD9ZB&UmU(4QNUcb2HUN-9y;0o8DM3Rf?K-rhaB{2AF(DjJNbt
z&i_n6!ppx5;t4`Kue(0pj`Q(O-21!kyN@%#BF0EgZ+g}kW{fiaX8gr?k(1SibDGl=
z#-BOY;A!JI<FCdD;~C>g<9SkAhE3Ed_9w08*bm{^F~)4;edASQyz!bb!Fb)6$VpI>
zjVZ=dW1jJ$@saVeF_zPzUNPQdgm|tohclu^8;gvE#scGGATc<#%-}Qy=P9SI`WROZ
z7&DAFjW;+^YC0~4#<owN-erKNGVlYnJUn7R`AOg^mHb!a%+zb(&o1hAbq9RgTiweE
zSA97@ZGalWscFM?OwGCH=#o!{Z=9GS#8sBq%2UGGxaO*EoINNfEDq%yw()8*XQ=I>
z&f2ZMS3dxsL2jo)IRc9T@1_a&@g_3i9!_A3mkAX#9SHA=yC)6K*V3SdmIha9X>hgX
zXHB2lFI2Ce@QHpxP5KGV-q*8#Uv+`L*U<ON^}UY1U#ss;G<4i_swcY(-w1d4R5%Sf
zZTOjnwV%r8BsXPTWn9fUUw@-azw%Y^HDP3bm%lxy@9s|M${2or!U=s!)=iAuKNr{&
z_?7YcI>9!K)PKmhe6P^T(Eh|aiMJ$n<t&Nk62~y!z7b8iWs)vUYL?V4>0!=_7?$)L
z`f^t$eVg=CxLUXY>sN0L-x2N+zBl|}cyRc+@Z|7E;kDr(OrM!&wlN<yUoyWiH=0|`
zZ_OXfe<znpz99MP<o3z;B#%s<n!F%+eM&f`NlLGj;VCmyzDU`hnwXlIdR}U^)GJf#
zr8Y@jow_yk-)YHdSEcnx8<{qq97pWKl(tY?1|^uuGewoQ^Hmx9d(BA^dymSr|4)^*
z52|wZ7FFIZP!;UGsy1g9*R`XnqrFviA`U}MvA=;1B7B#%k3vPIIpZ^gdp>kg+CB!2
z$q8%$Vh!PIA!+&Xc^ID=#CiZIe4HK{BGv=yM*D>7$p21+^K))ym^0YIv;@M$QOX`k
ze1}vzZV`JeaqK4!TKA+{fq%6LaihJHln)brCva^77893JI$xF!nSrkeDV8I(JX&NG
zNqGRVkK{RqyuM~1Hip^x#t70Kr4o&I2p=Yv!{osaWPX)R?sND%09Yl|YxWUigl!q`
z5?i_o;WNx%lQf)MCu&@0f&s~CHn8OAbp5Iu&jEaopr#y2+^_LHN%@UoDw!Cggg5z9
z3e1-wtqf#Og!Bb}<!~)eIu&piSR@zS_%A2xjo^JGIU0l8YqYUO5#u|&!+RRCkP6$s
z5c6WN<0odpsKg#7<`6OeOw7Q_zZ`6yy}4-Y_#bKiNbGBgeX*Kk?*d=P1zzI?`zWb}
zNG%_1978GzeVLR`203(K_vzU4NTnhvIxqvThV>w^Neog#A?^~g6fTl`G-1bKK+5kp
zG5K6vO74B+K1}YzZtjl)-%((T#xRhJpRh`RHbVA4c*~%aGDu;erc+HBltx+m2y~SK
zT^*yuq`bF~%2t(&TSaK98{r4oyGSXAlycM<`iSG~9h_Mip!8DIRC^z(t^gxH8ZXcv
z9tAeuA+<6tHV(LXI_~Dj=ay)J&X1I69uP>0jwI9=a6mqY^#pX`1GmsK@E-v?LfN~(
z4wOQu46x)-_Rqv{2w0B+>oGT$1DY0w*?TFOD3st(b166NgPIbK5Dy$|Z^ONe{V%?L
zAXYflUPQi115p`UxJV!m>oO|Kzj8<fhhmN+9k#;9Kf;4Y;laH~hvx{Nj4x@Y`GEH*
zIZMRfG5nmM#DwdPf;XQ-MbIi-2UHvyN-htPPI*GoHn2CSJYteu-AL{`^6W&3i~+BL
z^>u_<4~=i2rQMv~U<-Qh*K-<qD|*Aj>}~LK3Z;=oOzTK<J$3_WQHSHpf0DmRK%cK(
zBDHn+Sx+h(Xp=XGJ6k~I^v8+)WMNBA2N?)HOyDSGFC{-|<YzzFI1bhd$k%bCpPzh*
zT!@l4KY1%p>ZM6NnUwu*$|<B=ntZLKzD~hs8nzVJssyanv30TaiSa%{4#7Tz`(N$t
z>P`Mn!%oM}z|O?Z!Y;$_L7s=OzhDnzkJ#IdWMtbVc7bsz_620gi`c(|`BbFqYPHh-
z4A@EmTSYLFrMB|D(_Up<W`6{%nZTL?mU4i#l+nfBX>_$$z*!-v$^`$VkRWnqd_~~8
z5jyT@?<42C$oX0|3M`F*$|iw{DY&oW-6`F&#9W@3%Mf!GF=r6-9x(A4{I&yL+X3}V
zLAp5fa6WKVBt#Y=G6+$T5Sh?^2%6JUTGs;5B(x~$4}gBh0NW(q*HPZ<Np}M|Xin%B
z)Ckw}Y(=hyQ4%BU4Z_>_cFHFQpPBefbMr0Gp~9ayG_3*GI;6#VJwkOc{wm`yix_L*
zFURNt4Ho(psd)m<*h47I1*GH1s@>o$1iXiVRcec}w0T36h?WK#n@4duKv)S;TGt&G
zTzs5b=omG6fm2IEE9IdHU2np#b%{@8LW;xLV5dBB3BMzG2`6El+8utxwE|Y!p&~<(
z*7j~(k5RT2<MI$D_bJ5iEivpvS_FVKjI>w=y{{$3tzi5(C4K-YaRYANUD^}0O>r*+
zy`;k<5%Q8rie;gka@0o^sHbb&v)p#Z7EW`Ow!epGBn?oP(j_V~;yAcCh<h2JD2Mzm
zMg4aaKf*CP-8O)q_}EAU7GnGd3}fKUNtF69dq46c6As8nIwg}6Q`5~isv)u8qf<Dl
zWu{0+k(b-R(g|1kIWq8M_mbtrx%zu*T0OO^$+!&TU09KNt+6k{1<kO$8*Q=eus34c
z<0igC_zvMagzpf(L--EiJB05LzC-v9;ah44sU4j<$!jV2{{t!W{|B~9F2%E7tCXD1
znak)i5BdAD_rK%g)RcT`*P{In`Zq|mqW=nIjpJJQEp3~sNV)5v|MTgKU(Tr61#m(J
zoRC2;J_Sja0!L)P5gGK-FIV^AKA12s@;!#{wYWPqkF+*Y=>Md^HPY%x(e+A8U8klP
z#G`8>;(?no;HC_?DFbd2i5%Z^`u%!GBtLyK!qrAH)<tq5yOC;TY^ncbbS;fqGlYzm
zp2v^W4JmHTu~gSF`;dBvoc5=^94U>Q#cz53RiGUq{j$19g<<v>iI>Z1TmFj>QX`$f
zFXNfIUf4nE8#upRTK{E$Kx!80CrUV}jp$(!CIi_m{nxU5m!r0jnyfamx-KnF-{n6b
ziPaVRYiDrIcWXusj`G)!{7<wG&@PbCE~(l4<kZJo3TdZd{lrTR3^e~WWAC=#r|ldw
z;7S3Jw)#DhcUoZfR`PHR_^40$AEvc;n6Gna>@D^b`gV+rNV=qcSQ4}MgEK!^+zSN9
z!RLN^DK35uv*J%_N~QphA@O}~jJ=OE_eu)(4_pUG#ZS6v(1@b^GN7d}SAZ5nKuQ|e
ze~yO)D9@3x*V}K{%k6w{q>a77-bo2>q#dK|P4;K@4tovXn}B~iINT?mXzfCa_AYYb
zhH}E5i@V^#p&ITXpxFRCN4a*}E9~QZALZ{+4>J57l#-X{xqv1vPQQJ~j@t9+4=IPv
z#EsNr={j%rURr)J0|y-E&vs<YIWhJkd#F85<10lH(ERA66hZ>JM&6(_Y3Ig_eS%P#
z@qc1YXXS<R+O8$Evgg{%isiqM^F($i4NF|Ax;%^UL!9G({Azy#)^?F|KX1E=^4;k~
z?oj9H`23@O;K~hzDFa3pz=?Z{gcphHn6UFG3Wvy;Lb~+4o~@_-BM|)z1u1a1hLNT}
zXwDsE4j_)4`h!6IkG*RdFEzv|>PVyqb?Vs%6ltLB0%HFWNq?kdsV;S7VLjx$x%UIK
zWVGR1YP54;khNz}*ZeozMLN?SSIQ`jqJ0Kd<B?Ic?3`ALc2AD?b@t!m@DQSmq$L?M
zv`wVG+^Sn8?q4L~FN}6*Q`99E?5w?L+N1sI&=S$QI0SR?Q2rNNxUsBB%gixeY03(@
zLgTg}ogcyjE}oNVqsPHs7~t%^c_mtuqD~fn47VrqaK`_2`Z}+LY4GOONnk{1>#(%g
z3X_m9e8*$x{6g2W^d=+L=d?|Kmf$?|XgpbS<}t*F4DeB!acI9i)t(9u$_%qM@f4;@
zF+dOM2hWv~jMBf`Py1wR>|f>B+u3-Le4d9bZO@{;{fB!(^k>dNjahQWXtDilZ>IN;
zvA?1=-L2_=rR!Tp5y*$WyT9o3?Qo@xtc~#GjpurHpE5q-7<pog)=vBpDu^>K=4czo
z{rrz!=~=!*4`9D8N93pOuj?3*63e})<<0(rKhjfHjOWOG3pJ7S+s~1q9Xrwfh%zf6
zjWbE*Ov0Rvyg0QvssDz~<0Y(&C^>u}xjT2TLcv}iiZ&a4cAXY|7;QpY>dV+wp`qX9
z!gFpRD@^f^zFqf61N;&HP9NR97f7w8Wkvy<qRY!|m!+J$zWK?Z6&{6Z^tb_SRp}#0
zZ7w4&@lu$1ydvKxi|%aym-UyPyp)j)sNIpgaS7@1gt%|#)%E?G|7ZLIxqsYVWv?Lu
zANAC3M#g0H^EeW0r$9wXGMC98a43e+w9WkUQ(NqdZ+E+{>%r(E`eXS>qh&U02<@3(
zNXN(N+(}K>LzLEm!ci&GKB4(vWJ9tZ^^g%|F;1TaoB}C*1h0ju-P{{1t77eQJ^HL$
zaZm)S>4Y}U%+P|6p3)JaBey3hWhvu*K%)BrPJ2>jj8ET$bnTW>*cSc$pp@4pe95T$
zkM<scfUhKSC;hV!Qj3{k?qMXH)RaD#?q$`7?!T182aI*esJ)EuO0I+^j_8mgu@4vH
zB_b`^Fxt4eSZ!URN8Rcz(WRuHMd6JfmGfkHNuhrGk8w|ETrQm_sjC-2<HxAy57II`
zN*ipiN`j&p1$06Kr%%$*QsqeT7zw$L6;HdVlNS)fK0PMxh9u11;{H$d&6wT^+O)H^
zpA^QAzhLBeKXubV`zzx57g#ONb1S1*0bN2f?3IEEQr}QGX6mIt+a=$A1+HCdFXoy;
z*^blgmXOe}HbOJMV~ArD=|V$5$Bq@vYqH`{>Pea1DGjx->IV*XW_^z7dLb(HBxP1q
z{<>*_N#;F>p^R={hKV&F=za#5hrw8^NA^3Q;Y@iUBQ%|gXFoKXFH#De=@}fpqniE;
zqk^_IEeRj|rl$@By1pRZUkDO6>MZ#=T|+E;HI98qihOeFNG)eZOxE8E=9niVeU}>r
zSEQ`JibCbUgS(V7`1G&?rHIsapq2j^&u>X^x;|Ig&(o^iCDbnl8d(ds&Vd8=Bgbdc
zYncNjFXArGrNkS8D~{7|`pjN(vaw3~1u4wVEMOH|2+S`)F2hke>;l@UD@f<Fxbb}_
z>0);zztl6XIeAV>s$EX*+l$Y6o|)8ugXSY+W9<mIO=fitNG$@^s1+$==0N%Xv;3E|
zxKroja-c9z#bBW(!uMggML1;-bFES<QI3S!uS;9UR5Vo4FG}cjO7J^R0)q98Qff$z
zeC%xA79kmya(%)$nU+e-ZTy>3+77n$oMX`#<x9_}t);})XntHlX?l`yrM*z*h-d|$
zmH{6Q)`b40t+9htrJg)2F>;snyfTC6LV}#rwW`z}h4?N^3aKn}rQaa4rG1h{Shtl@
zjK>Ru{D*H|Jw|$3@|OgalSC{JP8)Qm7xIq*02PS@a=5vW8>CKljiiI0W5wHI#nL;)
zC{^*060wwvUVWfb_R@-fBFUtbPU;@-_iVl+TVxzUOYM{KQnB<LX)AYW`OtquqRQGo
zk>Cy<lSI;DipzA#orH<aN0DdRQ*L`PqC-dwr-KC<Khb?jB<#l13PFHE<EB~ogEwiR
z7ctV-^Pq``iQcKbi!{ByY>`0l;UXjkYdt9SHLTMv*^tlQQoH;q5whJSMWV}2JH&^f
zE#tM0<6*`heUUZrSqjuGHB!+y1)q+QH$oPzQ6z;DDSM+AtndjfZ}7fi?Iw}JE3kWT
z(d#PXse?8bttoJJ6u4FtgUpNS6wmnld*0$kA-!Kbb<ZsB+j$jCYW~ZhA9>#IQj@Ia
zag02LN`&6AB0<Fg49gq<wwRF;aX3Kan9#s3@j)#o{-yM#EpSHtCmkwDVj~va{&wFe
zXQ>s~xi97tF5Kg>um4x(e^p$L>bvBk3M;!5>yo67xe}Lyl+IyBCjVzfLqh6KeXJ?z
z55qzne(>&x0~QqvEM?;uT}rgHij;$-9;**MpFB&VC@>{h(<T;+aKL8L4$}uOE*Yhd
zu^hUSI(H3i3cb3<i{W&hu~gzbm&`|nQX|eQBLhe?X)!zFa=QQHxYLgKjyG26)8d&O
z4;fTc*n-y_A#IPiSe3Nv;zM|Dth4`(A)S~+is895vg9)rqzB`S2vc%}0qILFhs#;%
zC#!Et%vYi365dLbXGntI8j)PZ3^YRjk)8<qOl15?Xew6b#rz7tNnc8MUU<``D{z&@
zm++=TdyFrXATwf-5l(1G$08vZmyi^78ECoV#1ixW2VY8V)AZA^+^+7`&x|5*){pzT
zCHCHmx4!zz2^(`b=}UdIHT*&2;R*cFnmv{AdL<5Z(Q^N-R6bSkleu|y0Hb_(5zPA>
z+NTcCJfNjzNp*n8f3ZGYl2Qk3*5$fQaw4?B^K|Qg4929S<$pqIbg54~LyZ!XUK$=B
zzTCQ+Ud%>n_W<;>JH}~VKt?2Eud$aREi9MH$Y(w=2aso9$Vx%rj@3G`fd7*h&26iY
z9Q)V{str6A!^ZyvBsWnl_he5E`$1i4A?tQMQ;r=2DU3TNGXFGDB#P&t@$7xbGnHPN
zvzk!*A&xMm`DLsp=}8o2Pmd4gytAI8kW%E$iQ_P|5MsCBCdyUnANeb5jd>znM>Fy)
zeKhvMfz3m_IVM!tY8DCuQXN95K|E;bCZnalH6=+a4T<B`!wYg<cHYQ3iqi4*s_W~0
zdzd|)maXh^mDCE`@P2RI3ja3&*>+qVm^Ac^0?2A=*-N8MoLq>5!f_*pxE0I7GkOGJ
z8DT{4gN%{qvwj=nLG6Vre1bi4?4&|Ibbpuc!?b;Mz2V|Chy76{jGXDxkns7eCwJUK
z?8b6?3&P=)zKn&PiJ|3M%y?xZ>!(=_Caw;i;;>XK2tL4z7-vm@{k@iF0ug;fm)f=a
z>EFnYP#@(**~Qvpz>q9u8OtrL6=$rk<e2e?jMn3RvQG!2LrN(plpNyh{r$asYp6Vk
zVi^2S(l45#e7Q8Y+i`KR_B*qP_yuR$$i4~J2n|5Hzf<FSzKcH{3U?^qsU>xMzn$Ma
zWvS3=Lz?JZ6o+hgNiDkKP?Yp77=ofS{X%E7uw1-BXU@BX!iP62RZ=9v%fg2u9UQu4
zyv_9(@T5$M0SkW`XuC*2-pL$75_?n78bCRmOv^<@^qEwqUw08&O=_r`^x!UI2Tm<^
ztkhx0N<GyS9UrY#d-eo&Qe9n*827Vhd7v7^o}Gu;voi|)7_YEHXsUWsO+)9&40LzQ
zQtzPyV~+Yj%~ea(Qg-yLSG&|+bx<8rzo^4Tzz7;4Bhg4gcXWzT#z;3ZjEY7j<07M)
zQQfFvTx`@dt}yBuEseHDJL5*<F5_;aKN?A%L`%sC^p%W49#nU0i4eIo$fxY>uSm}`
z6PJsTYnP%6q@rp-yRar2Ra&Ym*mKm1e6>M4NFDOhm7Lwp?%)=*H2adreq3$H*+Z%=
z`FdF0jJA-!s$0~v>UnjmdV%XU^%A-5!rq*h)t%(|72+7LCa~{*GWqYJ-aw~OFZ7{&
zsrsvpYA3tn_o|=NSoJg4E9#)(SFakTQChugWE%C<hv?R5rgj*ujW%izxooF?GCCMH
zqffh|(M{!(=l;qf&%?mfU&(VSdN}?Do?haLpsQt+kx9P&#!KYDF+uh`$p~18oHe8L
znt_wbj7H>wjnYs-7JK<c4@GS*&m^)(OZw^6>8TrT|GuiG2Jul{<GBX-6v{{drvXhL
z^yRfdj}drns;=UzzSh0sw89(l-Gr+tS6Q&$8qBt(G}=LJH$qKb_*5qv_uhioMAK3O
z{arV94^w(A$y)<*q@?aTsi~{S8>;Et2t;C<@^nVos_Sn*clNLI-!aYgTOIIFc*(eh
zbGe>)RF7R3gA3=nOk?8;aNu26k(zUPAss9TCYpkWw(fPK#)Id_dDa9=g{BU#9z+*V
zLe3>nc(7eSc{x{MD0!>OCH(AMo|^}M4ftG2)VXTvlJ)X)2_@`YPR=fm<*Pnr>|BkB
z(92_ewCy;Tly<yngnlH&u0Jv2Caw++nrc22ns6?qX~V-yTWG|&91OLGs!ndwm~Fj|
z{e@z)wYPdmf_nPO<hzz@MMC({mvl8)6jK=ti2vGP;e2>T_S*`-Gy&p9#MS~>8$xpp
zXc2nt+{=j70QU(l{+*{kRvPjxRy0agqLeCvFK-^VyiWBzB$e!Ylq;2SnpV*Cg&G@K
z@MZ>>%;1?x>n#V!%Yx-{@U9pqync0WRp9MXTxEzURgWJ!qlk>mVY2xjA(y8zvaVQ6
z0+*~yP%doOLOWN&As6xI0`BMYydGMrO1*GC5GpV%I*Po@!$%=i3Ld;G04|(MMuntC
zD!d#_$o&TQ5-fOECRlK;axpxRxHi?maUO3MI-fu!`i-2+7fV~l9t)xJAd3Gdu7xG|
zTffd9*+nN;P?xNipGuUlb2&MS&zqNrqR%wSTrR=1mt%(lvNRP){Uns&P=gcJ`O1VU
z<Wib8JiO#UBhKYu$e|THoz#>@`is3n`$&P;r41)NE0H!8w5_S#PVaV+D`^b8%ba-v
hwCL;Iqvr$tAGp775gk~m;7$yDh|gqY*w^un`G3g)=kx#o

literal 0
HcmV?d00001

diff --git a/docs/source/_static/css/Calibre-Thin.otf b/docs/source/_static/css/Calibre-Thin.otf
new file mode 100644
index 0000000000000000000000000000000000000000..44f93821ee80e78a1a8d9aa92b319d29ea01240c
GIT binary patch
literal 46740
zcmb4r2V4|K8~5%Vx5w@28CegNbMFwbQv|U$EC`B95fsrV?GWTBQq@>MqehL!*flC3
zaK{~r1;H+2*Vub+F}4_E)MOUt^7=iq2WsBr{l4E9bL{NQGtcyyr~GGw!oq^622?I3
zrF;kUAL+M9vSK?$RYgz~)pOwB!OnVByTKHtJW5fKef)!iMp~t^{uGsFr6|8~{v*Q&
zy!!dCJrrelMp1T6f<}6JN2zjlP!!z>@<+ucN2i^Ox#&ny@?nr3k)V%``{VT5Ly&d`
zeq9qFf@voW0eU-lcTGsnoMP7?@+C#Z!#ndRIeJPOeJ~i3=yvdjNr_I@JJzntfcK>o
zCEJ^pnvwb845fwo6+^l+MH5nL&L#D*6%!iu{)3WAp<Wbq>UD?H_0JFAd>AGTl+J*6
zDJ3DX@E_!pEd0+V>ZCLS(!Y@gikxg|Hi8O@3a03T@M1?Fr`}UC%3iWiLV79C_N2#C
zTABquN;|1kB4H#_NRx}1>;41|8#0{ArJS*(uB;i<PG^c*PLk>Ul6ZJHL!Bt91wHCx
z`5q(@e&ny5@~8_-C_Ac4U06yrrFztb?O-IKnj*c7>P?NS3o{VErY<a}rclLoVSB1E
z-J&k6pc>Ndbzvo?rhC?f8+;a4QLX48s3UEsfHKC>>2+b6YDS-{3ri>!{iH4|r99|&
zbzwWInZ%_oETg7Kde()R&*J6OYsrqfczeo4x~VR#pq!*f>%vN^gY;fqxWQ*(6*bt-
zzApX?N@X{qF5H;1w;N}6_U`26<s4{8Om+^Nnx=OioSKyqmp;|Qxxc~SoSrx_Av41{
zU7w*(&(_Cz42(7;#-!`R5)xCqJUVsq?nQ>KS6!?-iLMJflkg~gdPZVuinEP0c%*Zu
zP9*5nD<L&AHZ>)iBzSb`)+;$WNuQb-?*Y8MA;#OQdyg*v#xpU)S)Z7hpig&>hAK@=
z%*fQI>*Jg=)1%|`$<gUa&Z#8jv)B0l(KzSC6z3889Op=Vv>`er&KZ85kT+H29Sa?h
znVzW6@c2ik1g1k3MyBRKyJDs$WM-!I^z_Wh$?-^n-t&k}^+=iauaTaH#8`bwMq<iD
zkF<m|$_%sFo9aY)!9QmzkTOt-R5FCZsHs#MrH9a9DwWEjQm8m89pXG7raw>_;F%7z
z6R89$lgbd^^zhU}Y&QIj1ImF^w8$?8;s~cOpiKl$q3|{lcp4x*9m@59@;gB-dO`Wa
zs3FvF2>m^;`^UWhD}Iz%at82Eg%X_q#x<B42{ELM`m(&Bgan|>q+&(-?2kD-V8wQW
z)MThz67VE6@lZq3FELO@QdbXndQshBJ$L!9dO8CY=?gtPGob|18_vLq^c86{NzD{v
zNe?+gI2}^sAcmBe4kbE6iS>E@JvAO^{(nY{)QR-S2q+~7o{{h*;}R{7DXAS9ThF?3
zNqg(-P5Oq6JgG+_<jR1S|G#HQph!)6VI<_wfqdzZGX~}knJJktt9n9do{&QRc|e<O
zbIwB?3lAu7+W!Zor<gkyN=boINJ}RI&otnW_FrRDKl{kGOZIR0(JpnnXA80miSHE6
z5Q4h@Ag+$0K1O`f5~-bxkxx{x$_5QpjlO8?z%@}jHf`42Nz;PY>YQ7;v~q3Trfs|S
z9Xh(Xdw6#8^6u=@rE9nDJ$m-)-KTFq->(J?9OO6HKVZnvz+uBj1O<nLjtmPQH9BHU
z<k)dvj-No4S~hQ~+<oxK@uSC1o;Y>-?2l*8T|9r`(&cMcu3o=+=hp4i4bd_28R;tu
zOIA&v{b<3r)Y_jO$JuJQa0&I@vT0EirGH2z=1lq7T()=d@|159visYwr|#9nKA4nb
zNKTuanKji^Zrr+U`;Lm8RlBG?`wvtvoiTG(Zr+@)=gyl?*-5(syAhNxoS-VI4b_&K
zKt)kAsRF8qDxu1#opb;_lU_!DkTjQclbn#;)y~zvb#CJ9<g9geac<+>$+@TVAm`03
zo3?am8P;;Vi-SvmOO%TqOFn!6zRtj}9q@|=euco#M3qxj^ibgUUeXNs9hcnD=4#(K
zbI#42TR1z5{Ca%kH?rl}I(~7O{_x?$L&_I^8u<BrsQyrG>jsMY&t($&4@d-m|L;NQ
zgCP&R9_SwU+`oQ5^TFr`A@`QvTYGQzy;-uPe;Mn(u>Xb9*)R?6z<Ej0Pv~d#8`?_$
zM?y*LB=&#j{g1bJYBscQA<WA{Y8JJDS_>_HLQRJiF`4?BT1hQ|74QR``S~#C&6E*(
zqKqn~EYt#OGqsJ{0{ya;+DUDvc2E`6YO0djLse0`sJ+x~kPW`2s;T|d0qP)ih+0Rj
zppH{VsH4<L>KJu``i?q9ouy7wKY|2zkvd16r!K(AU81g0m#Hh%b?Pd$o?1rTp>9&Q
zsN2+CY9n=z`knTsUejLG8@e;~2i=uIbazUidr*JU-KbiSGv3p^sQ=LYC`|XJ{-XO*
zA8237O822@dH_w){b?yZh?dX;X&F73wxj)M7Dm(_<_Sal(+YYBEvJ^!N;;5kKo6rE
z(!=Qyw2BU*zo0|tMszUUm=2{DQQy-J^hml19Zqv}7_Fv9(aq>Fv?D#5Zc0Z`i>c+b
z6Fru0L4QeW=y5bpkEeBX6s@HvQ2VGG)Klsi?Lx=WE$JA#9i2$GrsL^WbR6B5PM}?B
zJ>7<$NOz!<Xm>h=s-ZpUG`bT#nL13}r{2;&)H}Kh-JFi3o#|+51~ngKz&vUWH4kR}
zT)HD|pxe`vXg4~U_MpiL^9`JW_TpZd4SQrN>^2$Vt^!lw!zm)`{_{VYvXfBIT@61M
zCrCtl;R{L-BtorJcyH&|Kx%Gaha2F4+WYSt%UY<vY}7)%qw#?jBf2|K4}by1Pz@=L
za-v#N9pND8PW6Q&b0jqemZ%;!(=3=?b6^6kfT?DI3A6{M$tf8A>oCmssTb6rFlZ9!
zdI#uQEp&%F-I?}<Aqu4<pgSi)r%a*e(BIL^>9uqrZK8M5`{={;3HmI3nZ8Fqrhlac
znEZAUm86M8BXN<qNjgh<Ncu^JO2Q<Ok|;^M#30F*%#eHy0{2Qup`=W*RkBxdNODYa
zMsitlQ*vAKNb;+sRw|P!rH!S$w57DI)Lq&`+FLq68YCShjh0T7CQGMEXG!Nvmq|BD
zH%UvS+oV;}Bhu5-i_&Y-JJLteU!`xQRy(;}Lp#onw{y00v+H9w)NY*JM7uP*sdn@2
zmfEegE3(^cx5w_7-C4W4cE8wRSp%6?<|*qh8!8(u8!t<fO_O~sTOwN{E0I;m4$Cgd
z?#q6ay_LO}{fCh;Dn`vXGi?}8rYqB%8Ndu>LYN3<JQL5PFj>qj<{RdFW+P)_%9%=L
zKXZgR&0J(|GtZef%zxyJys^BQ+*#gH?j!FlA1EI#A0yYxGv%}7i{)$O1@dB#lq`cG
zD<!d0r~aM#gZ!GU@1GnUo1U7|KXqbiiax1-I>=_xv00h={(j<PpzUvLG|&u;O-zr?
zN{%<^rwois&5Vu(SuS&sm^27@L}&Vm9R0)?zmF7twi5hoCHRXp{-0R+$EGGHN86bB
zf6NpR6P+ILiD-zJYskmsAvT^vY&?f#CK}@OLxv9hlp*jluEWGO3=^w0?DKeFH$qGs
zA*PM^EX^-vVvty7P(l{;NP1SXAv!BF$X0Anh9Np5LChIy`zz)T{Ve}TF@9vMJ}%K<
zh#vW=Vk18z3WG+5iCM$y2*brP!#^)G+*W3|t*PN;fWpQ0juwj;{fYJHiRsbV`q6QT
zpbTUrW{mliGDb{^6f;DA?8IoXuxK0qXdD0Nj}stTFOtXF{))AX{lqc$BbPWaDNdXK
zdXb}EjM0Cj0F{JHF}<yXi6YI!Ppl?>)*}-?W=ep*N%%xWl&x%iq6efW+Sn!9*d>a6
zkvNG=YVXe7CVk41^cmlziRpTM3Q=(qV+~@93}Ph>pV1hAcZ!&nBBrH$mZndcm@3vF
z_4A=gwbdl`({xA|^QVjX(?83fA;xFajdI4PN@jdUlnG7F6tia55oU>HW_?~}maWVz
zTkEsPaAk>IkRuk6^NDqiIOsX`(_!kTl&NCMG%>@pk3AkgAzm!pW27N9J31*NEjm^|
zfplwU@2(!vhRnZ3*j^xSjNXu%1C5BOt1_V!*?YuB0|(eiNZ?c1;v3mbNI<MO3D_D%
zLSl<a;A6eSm%2Gk!lcRM`FXF9cl{I?+T^&@9Fg_pEPX~M=-l=16SEQxMDa8j^zoU5
z{9}Zu+SkVz^vS82pOc6YA4$ZKCbTipu}Po3q?5QbeR^VQ9N83!HZD@ZZmSndOPHFL
zpcgBqPl<!ggQV({>qBYL=`cI%Dki>sEHMVARFeK<mFiyqmaMPONfMzBV5uPNqs6~4
zfZkqS-TboBQ_0W6kQtW>J5o9XNKN5Ej0XnNr-sa_P(?##ydf3(7y{GuVg^{c@RFHB
zGG`{hLLxM&S%gJ=Vm5ipNSs3c!U`qX^kNMRnJI~4Mq9UdWJbq`bKMp&zyyGp41IE9
zY-;@rnQ<gmtaF#nwug`HF~Ig1XnPE@J^X-oT`M4znUI>50ZSaJV9;k|=qG1I8(`f>
zli?uYl+;Y|2_u!BIz<eo8L~2fJUKDN_RKP5CZ-vtLR4I0c48c9n@Cs35*~F7Aw=@l
zXZ%>+CyM%f^`%0}X9dIC=LVv>FtHU;tBITy03!M}5PchgP}z%~POk?U=&^*A_(?8G
z?nz!qTT2H?6Qn;%U)tH(ePO4z3$;tN`^Ij$-3GgIySFk%)<M<><c;C7RM|Y)X4wwe
z1=-KC=dw49gmD1b!h`W<x-o%F7!%7(WRjV+jD^|C>|}N^513z=w+v#ea*e#L+ymr@
zP<gg|joc_NlW&z*$`8qp$sfy~%HP{Jw{K(L*}jkc0Q(X4iT0E2r`pf3Uv2+`{SNzm
z_SN>s?a$g@w7+V9&;BR--|PkZ4+=YlLZMOU6yAz1ihhd0icyM4#g_`bB3Y58_)77u
z;s-^6qF7O(IIQ?laY=DgaZm9`@l^4P;x~nrm9q|PGnQvv*tTp()|2hU_G1It(QFKx
z%uZqR%A~@javp_Z1~ta+*bU2ZV;qV@5j>F_az_x4LbXYbxKBrHk9Anv0okKIsLxAy
zXp!z0i0q?PsspD-#0=KB2i!sqI`nuI<O+HMNeu?I$IY<zrYB;4I#I^sUZ`6As<Nxk
z5*<FmyjgMb!efofB1JWG)X3qF+o+FDP#COsTX?U5Sc;{cFC%xYU}vx@wy<w|AqObO
zcin}k`<jP8R^2$9SlO>oH?Tl9Vnfmo2b_)_G@rUqkvbxO6MuQy$lX4gegU!lMpPu9
zo~pZ*Df@A1Y4QZ8!0^-|0b@!}=4e&2r7OQ%zS?PB?&|5&b7xJTzH-j01;BpI>{T;n
z%+8%Tb4BjjwJTPwfv7_j6<2j>OlWG7UY}ZaRGY^1uZTSa3AoL|em!w>4Q|o;59E*h
z-~54E{BnMDccZS1d6m}vES70-2iy>|cnltc8ekT6K<%HrM(#Qb9+xUtnb9TYNXg;-
z=QWSEkMz~y-Db33{%rK=HUT4}$EIqP{p3|?;X8+FusXCQR$}GRmPmc3W@nXAx63Xt
zF+3qe(?0Z0A%FDFug7W<0=MW?FYKbHhbIMV{Dy74c=7bsi^pq{qSxtE>Z-&oQBjGB
zQBhkGtE#qasnVu9retQOq-1U?E!{+ZnJHQ&j6wz4tVY3jvurDagV7e*tCh!&{N{vc
zzhhXT#qrkn7POf`N9~Xw^XBka@6KUkTy@xwQL0m!yil2C#>CB|HJJ8ucGhv#J$D8^
z+*@g}6l=Nr(M!rR_h?=|xr4f(p13J;z~Oi@ZjXE87@UWOwm~6xujCi**OkgM$M?(b
zuE7IwuNTOGLQo6jg`&|w=sX$jh+7Wq?~{<Wx_SlhPv4*(Z_p=3Xo6#RUH$3UtLM8m
zPnz(9PBne?thH-buU@xq*6JBb^#jDoCNW(xCsR5U)rNE9(1)Qsw;p>~Yp{n<Gl56_
zzU7S!XFwi8HS)Ap%Zv=_C%m6fPvj}o$dqUeI-thU=%OrvL7DiK3>R2XfgEL`SF%KC
zelb(AIXxyOC2hil)Z)sobp|G6dBwz2nlsz0s;g2;<Cp1_Xh^<#XhiwB^QX%%>X5n%
zGki*9XdjI*>D|Yba~-KaTRDBgz8q@Mo|`dhS07FP;8@?OItz1h<Ia*3nj5<!`|BdH
zTJAMqOt0Q2$2@pkeey_=PI<g4;-sc@%j&J=x-Ihfgv9A78ZIIv>fmrqO47`PM4fU-
z+VtqJ=YIR`d~K?H;dkFJT&6i*vH7x2sV-rr^8-8wJe|NRj}{DMmA@}CBcll|HPbIF
zf_Is;9354kz(@UfTpq+c!H3nzh%_dwK}%5wIqLilOK>mT%LN{tbt++!S#5P;1g&sb
zR$}d!EHsx}wbsM3$wGi!Y2aUl@Ee4^+^AfhTh*WC;`{U5x&l_&<I3otj~*}=7r?se
ziWD$t4`%V6sL4~*1j3%U$y3}!rc@_P`zl$B*;K?byUNP;>QIJ!^STXX+8w4iCTaTm
zEhJ)ZQrUz$8uR+E5_C92p0X~paHckE#`MfoCzVh<lux$qWUOjor!3j}-pnYM&X~S<
zy2d{;yq~U{U+P7xk6G~f)8u?xeqr0T1MAm*GZ*#&#0t$3Yi%woU{DJoSeC*lrT#3b
zEtz5u$P`08)tFj-d&q0e>xU;%3!Tu%VC^Hv4F`VN%_r!{PkF$^pFw`8A!>kvP!NVE
z_S0d1WeHEZ88K$HaPXg9iiaet{dh(>VvcDPt9*AuJ;8W5=Bx%aJ%(uD)i4Cnxap9v
zm<eex&wAO#q?BygQfk~;QJR*Rm@+w5rxf0+mALOTw(N{6uawQpn?Glk)AUud)~#E$
zdYuwkrt+vGY97qvy>{(cRlYiMlBJ?zvt|FjEe6n93}bc5DSWm-2lFa*dsd0KIlMBF
zMNRiPoZF)}bI;;f?)CB^tV+FvMNW=OSXDUBZ6nK1%cVu~I^~xvjA=0s-K<7|tx!wc
zy$U8Eaxfq}IBEi&a3CfD2Lp-iUiB8WM1fAKIi{R^6k<ZoCKQsNV{%X;1x(Kwvt~_K
zy7DSirG|}Fl|Kb~lJsK3{EfH)Qo)-l2SQjy-g2hMREN<QFj8Uo3z^c4ip?;ml`;dP
z6wW!b$^kr@&s@HMcHlFz?F>GQ9-`f_>%}nmSNs4*bN-^E6SIZt<BM{X&*zj1Kj$cy
z@%<fBSs`7G8{uv#s2grniW;GArMMevB*$U+KT~-X4zc>HJDeN(4oNTFK+Gzwf$5Mp
za%e|Qw~&1gE*{&zeHV<<Lp5HF>|_QxO7p;J?y_V#BJbG44zEr@s~Od<w6L8+G`ML9
zrg1~1knhPW=VbB9+9GvFR(XBVaecPX{4i9kbWt*^9Lgs$c!AYdmWKA=&SumZH8jKI
z@=V5_c!gZ$!K(7|cn1`7pNm4Xa9=f7@FlN;IoI7H6)ZzxRa?8Gas%1k4-4J#VT<ry
zu3VBg8<s@SxY)=nof0=ZhH2FF?BSiejLLVlT6OYYS{P50(9Wcc%S)XU;}o_x_D7iB
zWoon<x0jX4@f4&+@FPJIqv|k?-Tf4%pBkZ3j2PwHF|5*vIvO*K=$H|?nj9)hk()^v
zSIXTNwke;C;g#qG>aF$;i*(iEVaSAr!I={Y`-_<o+-I}Nj1ntkDs*%an+3tX=nL0u
z<svp&xhTg$IgDSIw>)>TR#`iyHLt{(1=&Jxm_ze&lqPhO{7)+<&naEBkXM$V;U&s!
zp;xIArK*uoiUsD0o$aZzzLzQMgr30gqRLn;dLnqrlwDZW<fSa?S}L?DRa#J+(oC~*
zRJ4V!Jf!1x`Bqm(kA@w!H>(U}Rmh<fRprpidBrf7U5b_TE;D*=kt$&_U4Xn>e<)c}
zX`jk0Fyv+%66~RCdYkZ=ztYhd6YkBZwiTFJC0%JzG7D!eBLAi@nm(gJYanAU9z{~|
zSY~y~eKgA1BSj@j=S3!=nMp~bk%*NlksQgDS_@3|{DHhO*&3{*?^%@UWCp8owk#8Q
z$d$z`jrtd}QgmCXPGQiuf2GSx8Pz%#HI^zX4^>wxYi@1O!nrB@pp;{GB|4#2E<%IT
zjQ5P_7WDD}*plXyq9pR3R!S>lUa3#qg~b-s1NyBAa?%OE+97)-iZU0N*TWB%u(lu@
z9X6x#9AG%zdPi2qsK5lRrxHA<wG34#rz|}sT`9?EN-~O)gwgaj6j;~k{RBurKceWT
zl*ECOG^QjRB?15Ze2QM+O&@Tjf0EMAno8(Kk|xfQZY?AcO8Owt1EgSaZ4EBLZPZaP
zJr~gB^j`WT*g%z%reOW_lZ1esbC=|Ubb@rUw8pNDU0=HjnMT%0))#Dz5wdvMaghA)
zGX0rhOehn{e8ntamN2WpdbpiAz+ia;xf&$=HS$7`?)S(K%b(i2+DF>QgET+e{yX~u
zkm7IJzf{;O`YMKioSvgtrYKV!0tx+*;*~;Rd$XbJmuwEZk=?}}W-qXhluD(Ga<no|
znWCJo{8qV6c~<#I`FjIt19b!E1|1t*YH+t<(}u1MJsb9D7}s!7!~BNF8~&lPQ*~7J
zRt2bHRWntKR7TacM#@G_8?|cW*(jjVm`2f!5*tlvG`rE)jaD|=+Gt;+3yrRR;qyi8
z7hivIu`%1&wXu8Sv5n_A$Q{xgW;q;oIO_1i;XT)e3+AG@wcIxD_a^ctnN8+2d84M(
zL(~)0+3LmWP3nE>8|ptCWsXf9JskTu20D&$%ygXNSmL<L@wDT8$Jb5SrcO;eHto|i
zq-k{1tft>J-PQD7Gp1SRW|7SnG&|btezVujCC!^QZ`ZtA^U=*$G%sp?uKCmE@0}Vs
zwQ~w|n&33W>3gTmPRE??IK9zGG_jibnw6RY&1TJ3%{k4>7K#?i7TOjaTXbvDzr~yu
z^IB|dvAe|^-idF=_u_~1QG6CZoBx_$#qZ<K^7plz)?M3QJ3*VR-KjmIy{UbzqjWC1
zKDsE~SGw<Y#kzgEtGZvEW#Fjl?mWwRg>#AXUgsyyLQ4g>zI?#>72k4p%WW->wY=W)
zY0Fxd@h%xI3td*Z6uFeUylG|MO4G`{Ro_-gtyZ>r;o8!5u<IDtWY^iQt6ghct*sqe
zySDDtdSvU1t^aJ(tWEDWQEe8rv9!6?=4IRFZNuBnZ@aDSskV39{@#vhr*7xbF0frf
zyT$EJwY%5u_x4QtuI<CyXSOeFzq9?N_D?%d9R_!Z?2y@ENr$oympZ)e*u0}l$Nn9M
zc1-FxtK+JU=8k*acsF-9Pq*G~0d6DR#=B*@&2{_EZM9pe+YYxQZjansxVLxj;y%cI
zr2826DED;tJojbpCii{r7u@f;zjFWJ@r8%h!_A|M$0`qt$6k*!9@jk{dA9K!=DFPS
zT_>i~s80JjUG^I4wZrR2uj^hfz21BM<sI(5-1|)DrkxjczU(9S8R9d_NAHvA^R>@1
zpM0OqKKp%6`dsmO)P?y~g)3EXlCXj`$Khsmtcgpz8OC!7Yf-Oh>~}&>SrVhdSB1eS
zfUYp_-_G8`0ctY}NM=+UVT%}Fg1k%6_#E0SBua%SG)3*oY)9U*UzzVqWbK)K2>bv{
zBpxq=ZE`=nq>3-D%)H66-<f#q4K#Bn1NB%SF^c4<xA<ZbU*0l!1sVpsEu#lovekzE
zU=lW&=*m*$VnVKFDN5|jzKGx}7P93FS?;St_Z(;P+^$2`-dO%EE6lYvQJ)X5@%Il4
z_xC>>cK&?L;qzL&S%BEE!~Xu^VgCL#;pfjEt~swoOM!M&bo6L)LRk3d(OOkGoIKgW
z6O+R>%Qg#IVy-rG2L&G?S^Xe4;>V7fE&(B45rv86dfkA@qy2k1x!mc6s5@u29NMGh
zW?t(p+rf1Fef#0Ui;J%`Pi7VbOdXySiN~SDX5;rIlpk@raj@#r<-Ozkt<d5~+(FG9
zL=$nQY-~(Oa(|7N|GCGP_MiH3qfTXQUMtC=cU4RCg~`KNJhY58o26#7b+i~p1<(Tr
z;f=>}LNm06M!F&?Y9g#tcMMyd^Wv}!ONx+0{vvF3e#gVV9G;c$7A8ZI3@nkm9iEjV
zQ%y&fQX0)Dm7*D6vDOv^+4SaW>9Q$AG(z8$(iLc~v;uvjMsrK?Tt;PWVPPFMS94LD
zL2fLt$&FDUH>z+(8M_pjrAxb+*rL@uy~`pr0VOpbOa)GlQ4=CTd!r`U2|Ib?CO8NO
zJ+9!dV+HPm`}Bi{J+|+M6vR?-9Uk_;F975KYYvSj0GFA9Rh`4&!JTB2<fzv}nf01Q
zuIjhQESxB%O%`;_EEOo>OSS*_u|xf|cp6?T%Mu*T_=OxzKnB^#3#U%pb$S_a-UZA2
zdSfPc6aR8$&AC^a`^O@>>QupSEN&@9>q;F2S=p2_?imU|ht&s|JD0}z_>2g3?;N!M
zr>VLm#`T9Q!w{`O?1fv;p9NI=Ez@zY&#gFm?1s~G|8rQ<alo(+(>C#UH=NjiL-X)N
zc%PobBm3&cFsiR$jdV5(eV{F8VW_H8_)sYEQ7IZ{rZ1Y&c2Fg<5K=!q-_O(2zrSav
zkiAc`pg2qwUJfSZN$x)W15rI5W68(~>66oS=sUF2j4$D<X89IlrRj)9H3j(nUW!JQ
z(z{C0TZ{B5icr70eC_$OE57a84)FDG8+7AWEqaa0)%UYPPqfuwMOzS56NWAyWDL~>
zlpT%yS%c`G9vAYRf5LQ^-oren=@#TKEL`V=LLR(%^I&J>kV37>8^+^~ncg69QlYhQ
zUv0E<$>?nrKWUB$?Khz%CWkHNZ_H?~d4ZXGgJ$QeT?d`JbLY~zH*YQn^z1n>z*Wb+
zL^tfbLynz0ckI~XpO1zF1cZio>9{xeogHfM;O4K_MxW`q38>B#RvbR*bkFaibIU<~
z-A7b}oypR2Z+d1%4E1$tf4>iEpcPv@iZq(uYC&&Hr9vyAtGeAlUw4lI*Ixa0<=V3s
zm-@EX;@7xb-L3Fw&~F-~_zl#rgfnw4r5)9sOA6iZ1-6gQ-e5nI|I``L4{x1*x?Z<9
z=i995PB^5it83So{b#eZ;&eyPYRHUF+Z@KDXGG^Biv&HX5!Xo&kDCe+IrL$R^l%QJ
zhf+(Ql+u@xAicb(E(Ps@349}mb%X8oX{kfL1+}h@Ga=T@O+yi|i1ykR(KJ{@C`9mA
zzYI9zf~AA{*3Hu+XMTZsI<~7{NThy(j>|So{A%(y(e@^0v1L_(rQDvIwy);Mp({=*
z1a^|Ic%hLy-;LkQb2r+u+%q3mwFImKB_{N}+2NiAowUrcaPQDdbXm;_hu76?f9Uio
z;3#ga#e=QyQ3L~NUtBr)?&g>ud#uuN0-m;7jlM^1<!_EfckLDu4fdn9Ab9r!x_5`e
zdh`ejchTW~jOxf3b`A@>#EI7I&#T$NylOC8!b>}_s`1cKHKk~o8D0_Nh-%ROQoJ86
zLqEdew7!?CW<o|G43_VDw74x_$xnuD=WIa;8_cSr=9q=YIdpy*I%|>U58_q%5D!vu
zG_RTiEB{Uo@QeYeebS->DcM5hg*p5^voO!hjT0D0un^4^DrMG6v+zDy4i=<uttauf
z!bw>YF(7aTp;C_KS}SF$`K$xvQk%Km{h-SXtZEt$6`JJCtJXl(%go3GRYxyi1glZm
z1|ExQR~a*dx6y$)c1fzbU_-=N*@?s3A0gW5f0<^F9QeUfseObB)E$y~jT)nkIG;W3
zpua3_;)EHInt!#@>Qw7^DElWjUX{HX<RlYvabfq_79sk<BAv{tV)=4baCPBTM;Gl$
z4`E@{)6jlLv}iV)#Bkg26zdtBBb<TL8!h~nSH<w6Q7u9=s;&8lQ7z%0Ml~A|F{-IH
zkj^g^CP{@nQGXqc%kXGaMoer8_$W@sr)66hG!E@RU*a9Iat5cMA7!c)Bp3R|BK6``
z->hcoEkb~_2MqDJZl>CvCi)&6I(D^^wS4{Iv87g9%Gal&NJx$xxt7HS;o{CYycx{|
zjhG+GJL7&(0#|SsIo`#N5OhZUWGdWefQ8OSHPZY6YE)yvHE>up7vAU4#nsZ+L-`CQ
zs`J;psvGN2VMc4s+;sGpV=Gq2?eE2N)5prW{T=vkcodoAaLtU4Te$rJJlYCVXF90P
za@e-cl-F!IuKDd)H>`j>SP{~#ZE*OOGYMMNz)AcMytERfw_$AzDlO<Z@E006nrpSm
zcq@~@LzWC)TV?0Rs{Hv}R*<2_EqQE&!Speq5vH$m9G;uG8)$j0PR%{@<yGTY2jEg;
z=3WYHsTa%L5xk);FWd7|S*RinRWzEV*BsmNXlJrq70TnmCgDlWCnL|h*}Qcn(}f39
zPpxxnUey;YK-nbUBvw_BUB+HBOEbYNGj<g_M{IDO8ReOqw19FX=)L;L^yuw_HGyMe
z#$`gqZ-X%iU6FHl*0b<{KS$*cqE34UhL0PUsZ~v_<6wmPlL~RW??R0-*&ldx#UgFV
zs=niGML?gv4B)vtH;4@n+A@U|Q*52}QfLH*4p^CY#P#~JuRMXz;B{h$&EsK6xHYIi
z7^&t;(|PXXJQkj;3OAa>uVhP+q!o{I=9mO$6aCCA?arzqSO+*)i(yf_vfPv&X1U58
z_K0NGVT14mtdJB(6aW@Mafx_awb)zGk|h#g4VI}4JP_^WBEa6c2JD>)VDDU0z^XhG
zS=xmDBNfz(*hvg-g(I*FiU4ae>Wouycbp{yOM5#s9<>D-b_;{MqimVV9k++s4`oU_
zft`-0x1-zA9#HLoZu9v+U}|enN*@-VFJoaD-0P+;txZn>F`J~K(U3fPUSA%|P`Y`y
zWe$tJ2Pxqux}lDmnv$R$oqT<$CfpD+0$dlB8%xWz2Q0p4H8tj)CqZsHEX3E?g!n@@
zg}h*p(l)RIml%b1Mi>(}leDrQn+#{dni6CNxz7P4l1>9y{3BVZxPcHTMVE#5y;#-P
zTX~dg;ilXg#Nw7a*&xAPdZsJu>uRd?GC7zn>&+<Dyxszn_Q`!UcW3*C((MPF4keb3
z*`mchNGj7Cr?1`PbZl?ge|dR^MfXctp1yi2oG$vQ$q8eeMr}zskf=pISSs6|wr<9k
zP9fuxh7XQ4f!D{t3|xO?{3Ffv)0NkC+>O$&3ukT6a#kFJJk&9!<ied!JBy9xwOX*v
zI?8sYnzLe@V$#!+XTY)TYExFWLwD~K601j})^qsc>iyMRcX2D=BBDAE<F(vwo8<|$
zd8W=_{wq?2r#uJi%5*8}@I0EuKRq||5K^bug2$e#Q(1T00+#MG+Lkc#Cnnajwsm~f
zmw$-;M)BOA#N1j(I)WHSpPNw-ENvUv9bcY%`LNVxqf;fI1xC6;(7|E4KrQHuRvk=<
zbfL6lGGZN&qlqi92&L**c)SUXe^t_s;i8_V%DDp1c5*`2yW}<yHV_FfSOlZ`Rj+oF
zU%8u}1L<v%eA{y%zu*q!Ru!62LswKoNo&AX>_&D<6jyo{X5JAmn2cd5jN4yfwt#oL
zcpslv@sDA44M#ls29$yYQnVedp37R@^Rq?O#2qw~oeQfC*$%r%Kd0rpPh+`lRX@4%
zM?6T9Fd4NTU2RL;28v7;q#p6ef&r@U<{r=Ecx=Cb$W;&FpLBC^Fpsp3?OE}UN>J3R
zc$-1X8%o)<yw}zUEKg#ELeSj$vgo*DU-qwALX&S<m0PP)VS9~(+5Bg9j0t%YxhGY~
zQxj<`lCSqWY2%^y*YMoCvEd^}4RBKB3HgXC;L)if{$n^Zk%T`K<+JFA+NP;|o=}hn
z2K5E7*P$1}hw&_TqaDlbj$-qK9Yz{_u;EhVF6<Zqc3ffEHa3Vnt2gp63G#$pduY_R
zkd>moQH1Ft6EEE8$g9$TDHKejRwz-51_=2h**xrDM57F&6d9MO^@C@oxU(h8i+0`6
zLAG_pP?tYzn=j$fV{yXNE)yqA?K0zE%NUiB9R}Sx9bG6v>XKhe=zJ6EW|E>g=&Blx
zlb?*+5jZ+Fb<8YnDYJET@ekWH`;Ez8>cZsfW+ZKxs2MSKV))2?2`9E~-ned!E=7LU
zZgO7Yl!=<C*utu+ih`=GTc%82sZ%W!MKJnj3n-#%)hCK%c)QTZj9;UCa50^hS>(9$
zFd5omzz5|)88X#Y)?s&<FsY0)>~`eVXRw@MD9deL%yWkF$owIFL3;gp(RdKO&^wE?
z)-;s2hBMd?d<G4OrikE){B%ON89zY9=#Uv70$<hx#yZ-1P>nmgAPMS)dcA{3XWdWM
zgRo-a(Rm~K9$Icj<6uIU2?^>a5vTg}89ls{SM{))HxHivS)1b6FLq2&Q0#&0*DDU5
zI8hPNPpd+c=a|xrqgaP>Gg@atN6WaGoq6tH8moc>01nMK=+`P(^bt8IC5L_iduL-}
zUdM&Z&~dUv`I)^k{F_;j$}b+2ZISy$$W)1QObfGxz9xsg7B~;NDSH;Ex!sX)D#gQ$
zSp(0a#5Lo>p!pTVTSN{`QmRzAHAnr^2{7n_bMoI^^*%x2z0<U}Q->V_2duoG2j($<
z-2=(*o<4Z9RM)HYbmTn^DDczNST(elo7>Ra;BHZ!x&7kBnO<0>MZswNC&&KpD^6Ve
zS)=+&bmBZVp@k+Q1g*9dTe!WbLa<kJGmmaKxaWq`t&p?sSQgf;mDV~5j6L!f`_BG+
zW7p`RgTg|5wG&czG2GriZ;ilI%b*dRCMGW{PShpIr<IiF?$G>k{VI5V6kSjDUIv@U
zIilFVfq`izSeNsMk##E?RYbovArNvyP11G3BlTlE&4i{sF2XU4&<rnA-$xT<c!CKn
zOOkuxlujr`HVH2?p$W`=JVB=F&Lh<#BPuhZEz4OHXGSecxmUtEa*%QNjtbU5R%kTd
zn+02keJL7JZ|>0ZM96PTg#2XGmFPz=Ku7^ezb#1m!$BB9%|v;>Er=^Kcx1om-Q*JC
zdnn+liQ8PqeVoW0-P&mcA9Qb%oEullPDjd`ccp*S&}eeC^x|-0mvMs~$VBfpp-v|0
z6=Ah{S9y6=Re8dM2?+^NQ3>U{v}l;)iE(>_gU5{v3fepF`0>4aPiR4Qs+0DPTcn~q
z`qxRbd<u&?lbWDGM&wkC8XM>3IP6=>qe2r`bx1fy+B`)V;mvbXeLZ1<9ZF|}HdqJM
z+kJ@CZ_1%k7OzAVq_^lzq^G24lu!bl(;2NOK`teyN{*G#=8`AI2gchaQlW(qsQ#nJ
zjdtyN^ljPlZr@k0?%w!AJK3?5-;l0d{mwpla`Ehg2N#EQ(&En3)ir&<M=<i1taNF9
zzQt+l<bpVDe;j7K$vil!o;GtvmUhgk15DwX8PmY=PpnQC5t3dc9!(G!p`|vN5%Izj
zHELK<6S^g&U|jPB%+91JW9Vm-ypX^C3{IiO;YiW_sqS3ioT5IH(~QZ*hQj9Y^O<dv
z%hHT#SURy8+JH~~`}BvVwnydkCA3+J!p8Av+?<l9rRZ7dPbG9k2~t^vK0R6MxW(W%
zyx#AZYln87TMx2G#I#sLsHSJonRo9_oVjzf!Vs|@oc!mejoRg_>DaFyZlqIn<1Ziu
zXkm9i_kQWaqK-m0Hx^=1_u|iDU4^aS7KG?$dmPY&LYSdI{>q0vD8xIH=)d!D6}t5X
zjs8$Eoc}S!EO#G>e*@leLM9v>-S~^9N|OWX0jE)YN~W-SJll?qVvpqUsH<5xXLhJ4
z<EEe@;!E89C6C7olUuWc+5Jc%YU#MoeyHVO7P%JN-g~1yq=JLlWYiv3#T}^Pvkp*A
zy!IEU;Aa`T@9)iv>N-?8n4Mn)yCYHFp}E2AH%WD>JhXDdo{g3bd$?!*EW=ql)<y!C
zPuae$?bw^2&x-B1XTG|CziQ^Hb3q-ypKD;TW9=eaW2W&tk;0~%;|wG~ji>RbMR8qB
zx{yYyJdHoI2M!U>LijoJ_`J{-iuycf3KDvQ?p_I<26T!Au&RO1MToE!0*dyQ;#EA#
zSj@KiOX<SKlEJX)*lTJ#Lz%H;ENu~MYs*NHu`rC~ENTEnHgMs4*Tz7JpJm%n+qpMS
z)L>v#rwQXxq`yfVnE?KEedEQ0WV?fOFeu8&C<H5t9Y9^?ZuH>s#9CKCeQ>4I!A3E_
zQ^u7N@CT@}mk<XIRGU8g6`Ga^m9yrniB`J{wAvidYIk+uyRoQOQ614i^i^$B50;y`
z!@5SKI|y_$f$kuod$9+|&KI(q$p9d26wl4dMYQ#tNLif=q_c8?wmO#(|Gn%vMLghf
z{-x}LFmEf&7mL|#!1NH>OT-;+)}g1?8#b<o2-ib|>mkDRavob@wk+VkHgo$9p);a%
zb>I+)SNjg_EZlZX$IU3%Wc?s!Jx~ByXB0rz0|g!Ufn=^{@t`)tOpmCa>5YXJZCHOE
z>YB@<MbJN<R+pB%KMy?u5#N%&Fk3Yw=Ea^L_`8MdDNwUKqxj#fAtch4Zz;5`1IbrV
z0g5ws0}vF?z5$sKbm}l8kqX|_HZ5TX0i?vt)cjV{79N8KjuO)w?QzK8v%<V$Pt$zi
zoO<TUyftf9u3WPwZ{^IHd3iIn&mI2}#XXyi9U${(Wi19-l&IOi<w5Zq7w{X*+$~>G
z*RDDQ$~M5A27(NFoAlDPLs9GmP=(vAU_r44f#Xg~*okmERO2b`vAv4w^>W-+=#k22
z!kGWS-!5ct6I~sDjb6k2`xgeSP|ZNX0EVa1Qn`TNY2i>Vu>LJKKxhetqg*Kbx7;X}
z=;UXMSF&htZSyWH9$<BA1qR~MMHQ?~BQFPyJR3A}>;WAZ!J^Scb*lP)NSNNkyoDzc
zHY7mLT{IslWRIA+J&-hu<XH$Z^S?4{4?QGm+ag@)3+_fI7^2^6hd`snXy8VpCLnfq
zVprBqfhOkjSBvZP@sUJ~F4@jMCX(zc_3`z5aM~#bdlWqOLX775rL(UJbjeKjj8UU{
zYufd{jWknqD27>Pl;NaJESf5YW-y4~aP{7Anp=lPbuZGHnOE6^&bq^q<>iUJrs#$<
zs=S>%7=^ir%cw?OzNKt^sfL^4!{cUlbuTx3KpjA7rY9zw@i>&lO;_+WXtlIvq8L02
zL3DE>PaJiNP<h?a?@+iDbeBJcAUN&?IPKr1Gpc<P*bBr`?|{}+M_Ra@lUYtchQEpl
zplB>uN+z%s#8%?)%)F_Z+ufbztQFRKw#0fv5}DNXCL~qf1Tf)EtCphVK&WFGi;^K+
zo(<Xz)^6l61Lh@ifTH!K$iwtTJV8UuXk;nU^<?piboG^8TQ6RiuyueAH@C{n$eTeU
z|4>I7QbV<_$m0!j)RHn5TwKe@!B^~nLT7wD4^dO0TN`r3T{mv#v4Fzs&sFYLxzHEX
ztD~SIp+NSINHxos0*V-{N9>)8ir5~v=`Lp({ggwYS4H5pW=Wd)!*;f1rIkH9G2
zjXzNIxX64L&fgM5mvC2e;Z*%O*N5fq=Ar~4A(rK?=kDM^lY&EX7%Q|F4{|Vbi)WA&
zT`yws5-hZ4A!-YpLEwN}%;H|wcrivimcZtp#9NgZi&3Zd!Es{%;K!r60Nk>+5e?;)
z#e;d|U)hxh`4vu8IDOC0V8LvThwO)TljB!B1@3`R3X!8K$r*43&VZNX47ehm0fX5~
z`@l%7-9xf@zksnA%pNh8@Mk~?g&6Y@=%(*UH+haH#^mWn={-=c-lH?}_#v~1O1Z44
z>Z)!tSmQylI&H?ML9u#aX234L6PkrO%wUlnXgTL=JGS9bwAH&UKa;hcx?r?Ncar$m
z)-G-N+c2lhD941(n;}zXHS&eTt${pv-+m`K#MY<nK|>+>$y^={wMMmt?HYvdVsPM^
zQc+(AfG&22c9t(<PgsL3s2Gq<#TFs>gei#G!CJe^cd$Y7C)ihwTp`6(uBs#l@njx=
zOQNh|6DNx4rN)Dn1IBIqSERx=A>+wDhiiNE7Mt|(dfD8%=<CK{7TeW&LkHor;zSBz
z9kv-a0--Ovgy95R2kE$H8|$PMBB9VZB{wWLO1AMg>g5su*qIkX)_a!g#8biTN1=mw
zObKCX1H(-Rl$N~ClL7Vp<jg?72!9djAs-fN2slfSq&k{^X^pV0Lz{mCBnHvNv4e;U
zB&r|s0w;%s+ZMxe7f>|z8q3a^!0xm8LXfT++#$%TexXAK^!+}x72G5!Z4XI0fKR~u
zm`@~;I;Y2ekh1Q8kTnq`xB2X;;{S@yUCgRpLj}JhP7>6=8u^&HEszC4mgTS+yv6}$
z)K5$$E|fcH6>h1{DcUjRfacXP5mEUZkel*P$W1V3pOOn|>n<5?5H+@2%%oLiaatmu
zk-6pqbIk>Wu~|rfq5E8D+k}LXAhhi>0+kMhBP~$fM;~D1wu+6hSr+2Kvakv)3-MrC
zFoI<v-ey_g%In0%b-XC>5{C-Fl>dZ2UB_3#niC_Mq2FK`ZiB{}$!IuKBQGG6Su<<{
z0&*d&d=L!3g@qqv-QdDz6M2IG!yH7E7`PKep<_RY*E)0t(pMAd6ml$1;_(@40mQ%}
zAsGm)tW`B?13smwKpspL^F+4}Ix0eViD$<m+)<<Hcqwj>3Sh~j7IYLfFo6kVDVkxn
zehoXk=me@PrfqH@G>dE@AajCX`sWPxPZ$2%+Ro60pGl#KwVg-uv9OVrvu{!O8yDWK
zwkLGob29JS609Uhu+c`;8|nnu{-RCaEiKfIkC{3?-YKqVs&U=w?^dkV7RzVP%9}S`
zGd*{8!Op@%=l1JP@7a42$(;W0cL1Oopt}J7M#b=R;cwQaf!-1;>MiB$-39y|7yfze
za+~}eL*(sF{6aG0W5^2c#G^iTuukd{z5|6LM%0`3gHkeH^ki-PhbJrLpPnom5%FYs
z5D}XQ-()@7TvIsQwu$_gm>vR!fh-QQrp|ALN5DFPm`ecSTf7GxS$H7qCf(R$#USF=
z%wP*dxj8uq<m+7IYF%cNu!-C}lgP~nMY;KBBM?`k@E?E}sfd}7n}Oz7&)Q^KBGM9x
zwgcarj7>Y!qHHvSnC~DHnq<s3b9d4G7wUhzIKUlPj618j`}O<Q-HmK3_9IYR#~m0e
zhq{_==U#>RM{@9OHFF0Zpnbw#IPh{F?0_l#7cW+%@&VK&>j6}y@<9jQW)>qe#CHYz
z(KMEm3%0R+xoIqRcm<Lc_`yJ423HvRHj&5uaZMW*qRtkRm}jCi7>oCdaUgw@$i+nb
z#s~3UG4jnm5KW!ogj_v@1$SaC@uux08=lQWl=63XT2-Apja+US3zXN2(@`VhGXu|C
z6<Q?}fdv8kfaeXoNL+Sbvru%Pc#NDY{}tS5Rqb&@IcjJfq!!(A$DoE6X0ZD~@}Ct1
zm}sk-i2i4bK|y#*stuGpi4iNP+k=!qxM~QCq}E6h{WG{#&Z6+$5UV4jiKbbDTk<2s
zm1Q&O`52J`QfCOgKbhQYcWKUByQrharpHA)>5HfQ_cEKc(zJVX<=$%DmTl`xR#e#k
zFH7L(mv)LaEnS(f1^3~@V(5$;GuZ8B5X;egXjfBfmKRtLaz#(#Z@KOHaia7MT?TH(
zS7-$p45Rfcpt&cKyb>w0b4aIZtC18Kq0(#?HM8y`d~9q)8i1AqP1`+)A~YMJYbfj9
z&uONzw&4-omfXu?7@Y&wB5+>zKn;swEVw&lSYEatBL_VP>_DhVF-V}FyEp-n)0`Za
ziO?{5MM;IZvPAksSf<`vUU5}7nF-BInlvdBF8fSo`j^M<)vg@^8WgB!tf-#Fv0a57
zYShYzTY+hqh>Xc19^PbJP24qah@>05nBm^nubej<K}sY}n?x3Lu0@-w&DEQrWnRJu
zb?nz^Q?(qLJoRFXCV84ZRJVz#SZ3UyE!p6=Q^WmcUJp8|FDBX_+$9=pLj7z-5$HW~
zA*KBR+G8#7M!s01PPRE||Fk)2>zug%Tc@m}SM`Ns3iyf+-5b@D*zc_EY{wM1d7<NG
zK@#!V4wav%mVvvsR-1DO_Arp?Gx;U8{a|Z_^COQeEzoASW%AsGM?&6M7MRw1hQr}-
z$Fm7`V;`y^ZsT9~KwjjEZh`<oR8i2gY=e2uriT(W)c+Q20wnQyF)<&5L->xExSu2z
zp)Z&41}I`-ojaJVerx6`MS|Z5fyVmY)&_&vZS`cp1$@JU)m?Z&_St1C?#^A(!b>n~
zKD&|y3MyI5=Vq|yM0M5&?Sx}6lb9+#s=Z@S%}6$rC9C#F6n<KyTY;{?VOVFXU%}c;
z^(n+uUk;}F6fo77gQ-5nW~%3^aC=dmp2z<}dVyPI^GqATGkqL9)0T9$>s#oFdHfU7
z58QfN@@7ar1Ie4y*<~nu36C>Lf7b!s37yeKX92n$K=&e@#i`bZwmu<RH8cX$>hZ9(
zI$OQK<K3v<;9*7pxSEbS5Ss^xAD(D4Nu5M&9FSpxxXEs?b|8@;fn@P?rTJC$bCATn
zLHg*1;%(k@lxIZEL<hPNtyG`IU6?`JC!HuL`hNL(06@&1n*DXAW_)JRVIA^*Y3zWS
z*(4IXWNg<j^*U}g1pJRSAN=t{HZMG~e>9ejAREy|bJYyC3XZ#aJ-#Qn#3I;Ni2J+l
zFUJ#z;|r93)(9yTD13*R+Y6ay66JrL=uw*_?yJc>$(cot!&{KPx5r{Rcdvf+-U4|&
zg1rTHr;Fy}#Vi~M)w!hPT;Sp@dfOt%vc72Euz+6=1^7bZG+#&r^Z5YL;IxmJQmVj|
z(v!z>Ar1!90516=5g9NJkOAxagZ~PU0qYA`%m{Z<d7zUBabO_<Ixt+IL?WC3;YJ`#
z0m6+y7%MCX!NX>b7+q8c3wTpolgj_Ya`9*F5a5;1-wNRYGXNGqux8+PU@*z&krZT)
zp&)y(m^c(eZ4T%FQ&zoK`Ho-!5Xt8^q;SwxSSFvYMw#WGrS_@6KZSxmU8Caee4++#
zBWxAm$v7rOi^w(suFBeuNI`t0#Bf{xk_n<wUhP8SnEW9L>;$#0B$x*pL|&~MQJjdw
zREl2It4(>e-XDFXq7nv-wy~ntmRH+<BM*XAUhTk--cr$KoL8$c@wEvcK8mH*drNCO
zj^y!S=!xSsb9fQGeBq|>Q0)Q!{cQlM<4QNMZg7WVJEVSc2JZ!D@WVQ1u$H?;{J~mo
zFS&}S<ErBMHGG{z82^NZ5yw9ukcsD?yc~mYDhrzDusV1E8V2})kFPz@usFVwT^eX&
zE7>iz-T-JiA6T1Cz)tr9>rfuxd@reUz8?bTJ9yRoK6%xv=d$3r+g1;~-Bu61O+ef0
zu>srbu>s)!C6S{4ce}dI|BHtXhVn#=@IitRCU12p;e&t@9yj<i@Ks{~U)4p}@K!N^
zw+a|sk5HKb2o>^XL!_+i%cJ$f0c4ou&ad3g!{wNbU^2V5p7%|=2`01rhhBhJeRBTn
zt9)WI%N#SRpT^A(OlDJcC{#3*jRligq8LgBliB)<_kPh_KQOBMCSo$n@ju&1gQZ=-
zWHwdThf%Gxp}cQ?M0qdn%&ru1Ec8ta0kYie$FBSYWN8gM+Q0$qN`S3=12*7CgV>cj
z*~5TZK>=Oal>kWIVG+In{bmgTBtNnNl6`D|WFH$K`B4Rb)dooJZv!Ovw*iu`+5pK=
zmT(U+68RSCMMt5f8TWx#29I+knB<UOW!Bvo1Q<dEpk$#PU>e&INb<O`0RTzP6Cuei
zX8ea7#RDYy)P)}jBzf?8Okw+hfJvSOnB-MQ&i$@=bb?@#SAN1Iw=bJYFv+oiNj}It
zxDJ@)z|gjx0{8zkg<z7`Umu2K8uZ0wz$E)s4<MN2iz|*3O!A;}Sl+JhuntqTWz5};
zr}y91{QTpn-aUsz_S8i{@51R`2THyVJ$EsMCq>#o$uG_501<Kll>GcMfRYFHcXuCN
z4@%y2HJB7Adq6<RT_0mv$hfr0lXU<Stv2JkA}HCk+jLALHo*o;K3pnDz)DU)$=9#F
z{N*NqlKc2}Z0}1z$$yEU<lqynG`OK_9VpoZpybWRBcDMjPXLtM=_f4f)N6#tR02xQ
z|G^0bKYshiPrF1=GBksLlJ^K*q(ZC+O74;@sH>nc^`PW~fJX)qcOd~Kug+Jy3;<B_
zm2>a@xC)@;{sAp*pk%Kg041L|_Vd%@07?!H@vH+SH+lr1<WXl}esV9**+9vIF2cQ8
zYM=;9K1X25-7`lMSn|U@NTn6qY=b3Nm<0*IlDiOCazEb=9s2+*`R27>e!WIu$$yEk
z<l{kaHK^g6I#_Zlz>+5g@AuF^V?|i<lg`lm+b5rGAh6_|ubgnOcdM42Cy20QyaZ7A
zZvln>_P<g1O8~$Bwpx0V;P-)uV3&a|?QK1F83O^qE)yESE-z6b%{=46Y}AQ(l11-%
zJeZWH`P$su=pE`*cVC)Zm-~2U8r-4(eP^1h$^`>-m2WzO_X+;Xc{Hq!?_1+sBU)vg
zpX0Efc$?86;GeNn_)&DuS6(<J6nV3#$|?sF_Lc%*QMnY1*r>@$9+U&ILSw&yOC8R2
z{`OtYf^-BAr}GfF;^CY?&glRaLfJe%T}wl;eOK}?j0=mAgfwWb%{ATzI8=EH>mXyf
z8)U5VhK%KIkg>`;8}!^AmBNQ!>{QW=UKL83;Z5pL&x$X3YIL|<g0v>o3OMaLDRSDn
zja1<zaN4>JIPE$KoN`WzoK^^v)Vs<ijvt={9{$v#s@bp|2CuA0IH@^VwfDq{sJ%g{
znX_ii(t-Bc-U!#{2bIXzuPIn-(x?z+Yxgpv1Ijc%fzl583ge*O2XdjbgT7GOfm|r<
zpf8j*(^o7F^#Hs?5sphkgXI92sR93!YGnk!VIeyYfKt1BHwg2SSd(KID|}gZ6-Eoa
zHl1|c_dNHNuXQk%t7np1y7igJHJYK+PVpLzueF_cjpkF{f?UX3ko!Gv{ZeR>1i?<V
z-;3xg)k^Uy$pSzWEcl8QU+#j3|E}oaUn#oW;mR-g+~<ztSAN7u%&3;$9S-jza{ewL
z=kG4!R}x4)a)j;LF{ke3J=}h6o&#L#5FBDDdVb{!9yt=<h~q@IJ}fk!2>e86ipVb%
zjshM+T*Te~i~D8Hav^v*>bl&toWcGp<asNF$W^HQ%97>wxZ6sFnKdg-iyJgBFK*Bf
z1vCNO^8iP!BKH>CQDfm_g6H7Fejao*e7}Tz(QP|jP5%gAar<4uNxUT^B(o)VB+n#*
zbSQjzEf2o7_FVcFzGgDSZW4UMq{{A;Od;cB!Lo@m1AH-Uu56*~kn93u$0!+&X~wi*
zTEchH5}0Z5MYMU$4)}7(0r&>mA99Jjp}dK_xm+i2Blng^$j8fL<n!dE@+$dW`6>Aw
z_;Sfh`ET+!_MClddvE&?`*8bc`y~5$aM!WezSRB@+;_ZZ{|vrX!YCRl8Y|QaCxttF
zucW79pu%4<OcAUIQ;bnWD-sk2#Z1L)#X`k*iq-Jtl0wA}MUCRP;xv4><cZ=}#XH4&
z#ediaY-3i<HfP(gUT{}(5Ick&0XHT`v5{;%o5oIO=d+91HEhv8J)m$Y;2$nfrO<mb
zi}-%e$mNWBWRnf_p*1-EE1pTI?oj-A)E^ooeRSg8b^X597F}zLtVi+Kf~!vVFYkHy
z;@Fo1is2mmDc$b`mTTH}4r|r@z{H1eY^f&7?q!&xLY#aCM!I?VRozSj6xdPu&mqS=
z#P2w4;y?S0gPXiKdyF|$<27J(52es%HjghcBPWlF^w;?4k6hEC-N|^r+@pHf>vJ`g
z`%L7UBM}I`1Omul>7BE$jIe8VPxBuQTf9@7Kha<``1PO2>6!Qm5(|8Sq~}>I)!=sI
z+aWjtz5{}_*FBN_<D+H#BmjYZ8RMsM8zREE&fb1;aZgO(I^AaG*3{4%Zw=g?^2A!5
z>fc;odX{*-l=>+bM`>cRz+zFtRID);Yq#b59nvU0_()!>6n#R1;=jB=a^+;E6ISTi
zD^IU@>fzf755zA$bcLJT$`mjRfA!xMtK5|Z$^|UuAjg|4u8?E%vJ&$Sr-Sj^Muf-4
zhtC1P;f`gd;_aH9rpXgylQZMtYJGc9ThU^2VF42?wijTrCoyQCb)YW}gfB>7Hf*@Z
zfbg>O)3ixU?+qs+f7KxB;uZK>gHoNCJ~=U73!fTkEXx7gB<>-9@kX|J+vd%cPBrn{
zLW1I=M$CnK@4LS%F59E2C`pggDSN0*zcRlpy;<|j>C(jA?k26LQRZ7XuJF9m&2u|%
zJvtaOv}hBbeI>NI({lTtQ@!P#7W9g6cj|j!;;l?Ak|fD)WE%D1PQCmlbnYFr<!qXE
z5>wpg^7x<T*{fzmsZmpWTZY%chZ79UfYt+kpTIu>Oh5^vM90;anc?`YEHp1Cc8n%#
z`qC9sb;<HM>o?5L*PJ)koLj9sjgSo4m0*eih?wl`YIDT_P2u`^vp49>^5xTVzRS`0
z$BgVhQ<s8$WN_i8_tzf{!^&sBsQ;I$e(`Twx?CyHCN-{P&<>%6tO#FCMcd@K!rDTX
zfo{u{CDK}N__R<UAlsXP`L;iHflmPi;y?sX)F1vqJPQ0j#hnX$RK>Z#XExbQ*i#b>
zyO9L40Yr?v%0r1DS|2<V5Rr$7MS+Sh-Vck4T-&OpO0BjOQ4tgr2oD7+g(_9i3Ks6~
z78PtSRZ$yyP5s?mE3KD}U3>TbznOD(&t{YG5_-;W=FIb(Z@&4yZ|2N7GsCZ6Yy?xx
zubDl$i}|UqFSk5<a!Yt$_<m~Wt1b`89ujN_!nPnZDI0>YEeI`IHSd}4J`r2-_>yPV
zl(PNDhNVxgjm5)NiEwD#<jG@Y`;Xh^&7B;ZIQ@mt`XzTwl7O-6C-Z}xNi%islN-tl
zvX*mBFV0xC>BQ$n6Rq}ji|70(R$29N-X9mfvVG~h`HhRw+;_)pzi?sx!RhDbC35Cp
z)-6%WFPWi1W&Hl<B=WzCCY-t`YvtpuStGEVMf!cB@zBm4@4vJ8)~lXCt^EG((XaGu
zRWabQM5MfJmzkd?F3cPE{oCiwjt!hz*JEwst<aLEAKkd2l+}%wuS#5=S1>0&X>I)d
zt!xOvzq9e~=O?<)h<6XQcrnpw9Y5%?y+`-eiS8d7J1;&y{vHbr5?3-)er5bsS#UEx
z@m?^|JybC0)={?&YjyV3AMt|HZ0ARRee?C(25l&Je#W1)&c)$%&icnzR6Sq%@^?0l
z82+7GuDfl;^!4A1Id%WZCJQ451YaIIr0TL(-VRdTR(<PNBg=0>@7kTBe)q=Aahqd_
zny<PAZ+>~sTkn_FO)7SxcZE7#$1WHj^Zo9nHF=J%>qn=29v_s;9x7u2M45bdzc(={
zp3MAFcWo$<oDxmcB&Ov#(Pu+j4}QBW8XvVfF$zj9>`>;63WwO$;U7?=CK|skbV;{h
zqWSdCL;ZTPv}f<ZAU%0vOz8KY1mlHO9YTAnAwj=$gNYfTg6Mnk7UTF*(jv%KCMkEp
zYg@K#e{IWzYer8Hvt@gX9a}DChYI<AQH4$!{P#rL`1#{wRe6b`b%|Vd%o<!89~>7)
z(K-n|fBcz=w(NDn-zHPp$wPjjbC%D0ac6MKb6f6uwN*S%en9+>yWW_xxqRE3bC!%B
z8C*Yk{IW6bKZ>ZP_3`e(nW4#v?yM+D46KTGSyR_~&3$(`E2G|z=T7v-duGgvIX?}*
zv)wVjHWe%k$J-Q73x8N_ex$DZfQ?V0FWj>BhH<w{op{s6slR%0<Jw;!ezRuZIc4hX
zr&g|d_Nle&p1pHs%=w~j*fs1F!|oAREFHP-#@O8Wc~SGT@77IsPg@;J+@07Ky#4um
zw!Kjrf4A6)@0MLZa)ZpN&G<Av<1g~Bw)hh_=EfgelVCTK#K_!#-?sTTznuQu#7E1W
z<!o8<y>MMW*+j(7DLYYb#k9AA8n)uYZo98`2bW4y4!gs-u&khOxZtvNyRQFZEBhyN
zz0JgazU%G(YUS8ZonL*x(|+7^e!KF;7vF64#G?;BK-nwq+H)Xy{kB~*-)ZITZFX?h
zA11tB{@Z=uT^4l4O)7Kl53_NNyB`?;V$Tju9&|r#{f2DWa!zn^Ui?nB(7=dOLJI2c
z^)Kd1ujDE(n6j8o?pc^}QZ!L%cZ{iA6R)&8##H7Ncq_=|RY+Emms_<HbC^0^?Yw`V
z`wN7VQ|A=3L(LrCiKoX$1jmOGi`m4(VGf#KyB#LO9$-I?I`%8ko7X#Y!#|_cy`x?o
zFmTkh{rbIr?OSiZ{_5UX!3Q$Iy)V!CeEB0`XU*MJ@$#y9RnDq#H-{~Ds+_uEQ71Yw
z?0nh2SDEwP7=(acc~0H#mr*K-bLKc(57yf<&52%E=G?L3j-q(k8mDUS)n#l}B2&jX
z&T@8EVb<c+wT>BEjl1Q^su{8P-1s)>fBR04kNI*|V$5{b-xtIe>6Om+=Ea;hR=m7z
zcdNb6v9(p=wYAQpU;Qb%KJ+6t3YZ-#V3op_-^bY4q(gjS4r7_GqCw}Sw_ajjlqVJ}
zSyKA=j~{=C{h7Y()H&>|{QF8LI+G6af0)M^vUaE0@F%*4%}u_Xg}N<xZ`Z1!A58DV
zCn8klb8Gld+j;QyNpl>&XlN%sE6Tnk7dm?HIwv0DCyY6De_ksxrsLiDjVAuEvo>dQ
z-2l5$fy3Vk?WQmfnrP=t4fFYeO=zm>My}#7f9)h@%lw=yUU%3|IxrsV0OoCWW`9Lh
zu!BiV?^LY=*W+%>7A1G7h`yXKF`Uz&|H|Iwb5x$bgjaz%s;w?p!9cO<tOu#~`UTDc
zc|(=y&&ab=73=>de3dHF2UI)U?Ipd*d$+3OY>><K-Rctkw2I_}D65jQK(rbW=!N@Q
zp1Ep-euw8!{131d^-lczRD?}dM(D$Yzl*sMb2IUd`#h7lPr<#bDi4fR0}aeWz-_L|
z0(sz+0FPdz0cV~u-2pR7MW|zm-l!_{pVeT!RdwXqQNOM_s#@yv5$=yTq2QP7{Qt}3
z|LR{D_u#-z(h{!sn)HKBo*4f7sDCTYRym*s=;f*?Fo^m;!?`ZF_0LrY=oDd3^$|Q{
z`X|s3x|=dTW(WJ<CLhpoXgo8yLvLok?`gz;2yUgSggx7h0ZvO}-Ur_{s$2|PN>&IR
zb$`;|1pE)Jk$ifAYMV3L#K)ldcGoQAc^~Ep%reYD%sR~Tm_3+BFdtzS=0N|P(Ue&X
z{C}!gV20|Fa~1G9(-!ZmGoj01_I!`&4^-EjGHCG`{vFi&JXIK&sLl-Bsm>Li2}m2w
zvTpdO3-lTsSjM>*|AbFglb<w(maA#EH(h=Zo~UnxKj00&fj6#HJCpz5S;X_7DZUWi
z@EOA!!WVu6U!-_Lc%UI8JOMxW4fXDi31K3JFFalle()RN2jK<35ngclA<&Y#i!4}q
z=!HyND)PwA_Qk@-!b_B4dG8DLN79KoC^SN9RekzMTNxP=I(bIqN!p?w-Hc!4%WsyG
z&dOU5dJe_}FlRy6+Q7fTw>}vZIrNO7xyYkud~_F{2_T!Ek#_JKBd?y!`rF2HiwygX
zzs*FZ>$REirI9IVH_wPnyK?PsN8`3KE^_Xf5?AJ%+4fbfc`AKtS_gP~fN;?bJ#}+e
z9|#|K#?J@HhZg-1Q<q`78$BVs;Th2xp5E~Dit83V;x~qWJe?wXWhXNFg*t!{`Sr_r
zQ?mYu(LYb4YdrlTIzx=;NQ~$zqhHW1qVqL+M|47}PY``7x)i-8I?d?veAR+eEH1_j
z#q`8fU@9?ZV+LT(!*s`Vz;we%Tvv=qt8>+@n4!L>3j9!=8`w(Pdw8zlxrY3cOrD~g
zHqa)Y6M%+uzLnVba(ni0ghWnU39Ci=Zz|Alz>txuV9JEs<@B0dkswYFk*A!KRZKhQ
zUUFUkHmc)g!$-DJeTH9kWg9hU<nY0A!i`NSXK|gzxm)53A{+Ut0Ik!~_2;N&s*r;e
zN?d;~<u*q{l)C<)a**e!YNcA=IAQipYMrq+8GEa-UpDq{jJ@k-HhxxnjQy^$-!t~d
z#{R_EwNq}pWrF&fvFnWee`4!gV~361e9H72r|4p1w>EY=V^`ch?Yo>kVeDSUzR1`E
zj6KxY*FYQDuI$LRoT#Q{JG3Tjp)^NUG-(q_8f(kbx&u&KPSH8KXFhFGs@l<q_uyop
zOVuC)MXQV5=S40KeO><=*Z;Qbf6tVqb;SC08;;bd;6xR%nFiGz!KOKo2XM~Xt>g{6
z<vk9inyVIioq0w$heiBR{hWE004-OqcjE$-Q2<Pv;(1?+4L<iq*Bzi&jp~OvMB~SL
zzFwq%s(++^#_2W>Fa`U4eZRh!12!Jg59$T_hx!MMEHr13Xt+~KoCVGUF<X(-#d?!|
zK|jgH)64Wz`f0YFTcMxP&+3)>S9+^{Q9q_1XG(Vole#bIEqXIYbv&wH(J!+--LJt!
zqw6%M!`P>sau?u0kf2_xSL;<A?6HOzZKxeEC#Y!XRD?W4kgb-)M3Ij$wQkNxu&=sM
zT?GBES2u8O&uwZF2aQbQ44c{N4o<Flz(5*^9WyQe4dli`iUR8lpq0?VWyF@N37iBa
z=Uv^!SvX78Gn`}dvHFv$RtMFeO=~@=R~kMF0_`;96Y0YxI^kAM!SU;aQo{!#+Y;|-
zgZ4%nbT-=HT%!&88hMtw+O%-J$%CAjCme2`@a*K9W=&HajNRGTJ&b+6u`e=qKZ6~X
zDYY?M<VK_`pu*u9O81>;a2>%Gf?GMK<`Yi1>BX5hlW5<)f%Y7oa)02noQj;`IpcFC
z<;-Im{<YlV-1fOWa);&KkUJ&!h1}X;DHHGy2X_TO3FU_hLY+f>LK8WR<j&B&q2-}B
zL;FM3Y$RC3u_IkMbYu{d?l<S%pZ84Oro0cCZ2w1oett3A3iik!mVZP3l>E6&v;QIg
zi)Ka5x-}chH2Q*OOPjsgY*({S!jbSr;UUzln0CyE6N1LoGP#cy464#(LRI4KojjyQ
zCu`xcXmUTT8#X=%-YFzcn9$wi3lkGjCCS6IeULNNBIG-)t_H?v-1(|`a=khoNYUgD
zbp}r_btO5+B&(6Q=9E#Ctbr4Qs%5fP&&Mpn9a3eaj}bc?Xv>m^^nzrK-U#&5RUWoO
z8jDL1=(VN}f<qbZ81Zc>VFJ(D_!ofl!{D<*h4p+@2&6*>w*)082nPbGO>KxN!Tm5M
zf;5Fy7%Uy?Dmb<vbxWQeonrXgB=;zR_b}-eW0r9`)r#Z+>Ty`g(T^mn^#;sF@Ms3a
zps&23fz$?wg6+d#y8=^S$`#C7G>|$~H{uppOerg&H{eEEC?yXg*gME#b1;tr^9;e9
zurgXUMthc%rfu>76sm?o`+yXq&M~!^H;5;QUk1fikbWh#n4jE3O{(=HP;U{fBP9ur
z^qoL1!dMClAHffXf<fqV$kb{X@hib^K3Zu5?lAZzz^jIwhg1Z&ZTT8<B%q*_C+&O)
z&TBzx3v(Y*Rs_B+p>8g&-L!?|=8Z79nzpnYB&BaRG_9l#y{X%1N*qI3`QTe<c&07S
z(cnHC*ek#Z8p3adU?DwO9#F$Ta0qEm%eEn`Eii<m$57g0!r>y$+A3tER>-+qC3+wn
zbvb4b?jbzqCwBui0=){U!#ltqNI8SZmFde<4*_{0kl!Ni9dZaH3#ESA%$y<twcuC+
zjyu4y5FBf$Zvs3IQ@b#2dI(&lh9y9c2(IABX<d{u1T!D{6sXh4QHW`Wxf8P*Q-xWB
zS&LbRS&#V~a|rV{%-=DN+C{l<1NR+xW<PZLh&p#qZlMMxZaFbMHo0GqCoT%MHROqa
z?S8Ng8_v6$Ft2rHlWH;XE~Wz2g(UQd8H~sJIB^jzn@8#r7l#T`%Z!MOcq>R<BK!#@
zgg@JYuaq;JvR6R0w+*cYzRaUoip=M^04|5S4P2p@)J@t}>SIRIf(3l^mYE}{Aodbs
zD~YY3)V{=4=&`hfrql#bgv-OGm89Oa)H`B0X$2IM5r$Afuo8(74QV(SUorX4AidEu
zgqIRliN7}{U?d_+?L`_yqQkhv&s!^H7D2}rq>F+_OEhmK{@&Dk3^1BeN;N6A8LfE+
z)Ne^_WpWEATt=YK0S*_7AzPwXn*-an&0$m0b|Y6|YIcaYYJ-pAGRj$uj+-yq8A?53
zbhpqzbXtz7Q%hvE1bCSoe7rts^~PJL>DU}{rZxSs@$m8D+f=^em;X$t$5xYLOWm}T
zo!r>tZD$$%#~8`R7{^MF7CQ>54ZzcfjRqAd+6|{L!hySO@3S9`5;oGr+amHM;{elh
z(Q_JEC4GsJRmM)F?2SH|pWKA&bYg68b$}ezR=QKtSxt$1IP)=LWaTiDY)4ul^3bb_
zguBvH3H08`RG^1>GwQja*M8;!u4Ggz{Z$Yxewfre@d@rh^DG2!2`R)Ml;P~Dypmg>
zPU-?X_cE?RJks-+3yOksG68+Sm71UAE4dV^a%i~}1Kmt{&{o_vru@QW4WTmjW9Bbe
zOZa{;l=k3M@soKa_fR5bP<{<mkqgek4#LGT!2wsHw5h>5;{-}L;Ur$1y1F~vGRgf=
zrJD9q$pg5njsJj9jaz88ANL+=5)ryeeNG~30^eVE<c#)`)~tq}V$;HcnZbk@s3>_(
zI-gMpT&;r19gLP5z2JM>(!Lb7KE#|Px+Q;-TxyE5l<=r0?>g>fs3tt0fjPAiZj{U9
zw9c>7PVTc(mRte9rF2F{3CJy!p<OdVjmNaKSDWnU6ADRgL!VJ=@T$q+w=Ae40S#Rz
zi{nfip;42o;k1T1#?@QdoJKB@;w%nq2>)XbOKt)mq1h%d^YsSVkQ&3FvG8mN#~yMr
zVIMm-N0Xyr4$2M{ZI>N-G$kA<P4bXPo=9m!@Eeg>7ic<9LwS!>^3lW#c1M#gJ5AG=
zXQw}*5dSD5Yh2a@CNgR|iC4Ph`c(9Et?f&#&JaY<g!Hk-#fab}q9&uTtll6y#9)$@
z?quQxY=5g{wx(#lEPX6}hrwJ%<)`GzZkdKDcPfzaaaKwSthVyS7(|-TU|-|dqkgHU
zf(HMRGE&q&;nHQ47-oF7p7DLd99)PVK9nIBjL^-PDZ-n9(HA}}yb+}@=s2AMDUy7Q
zIj*K_+emD-oE#?}N5}?eqc<fHqoQhgzw~7=V{l?)_J5L8O$tqzeINW9`QKNv=WK*J
zHX6+#^9c=IGVk!UUCbVAcSqUs9xm_e=JLB_L@A>YiSS03cI++he_tmyrfApVJz%@(
zjljzvp6#}nWrrUrUjsQ#Qcwoxxn;OvN37imp~TFEZC&(2!{Y{-xyt0TFny^|TrfKV
z<xdE=6U}V?13OV5_4C^vTWmXumR3sl;<nA;H<=;&7hFgot?1d}s)xkC2rq+0;}jvr
z$Y{g8SmXKXRqklLrp@-bCC1SFW^CwEO|%$X?Y5-oFTT^tc-yox^L<$~mASVaa6>Kk
zcIorbO!eu06sWoKXJ-^M(~FeJ*M6CQv@$1ipn_wB{>e(B<jAI<q_>yMKTEGA^U|5}
z>?Szc2PKnVv_=-+r$_vz)MKK8%+MXPJ~1sPv+lO<mQqU$ouqG+@5gr5-J^`f`v`OJ
zrdB@PhTqcln6QIq($r{@v~KU3EvrrDvMrEK;z3D=H08?B^<icam|Zk2D)Y6~xPtJx
znZGl0T4v_kE1{8k@ecsAyqip^GB0j(*Ji(WHH8<YA3&1UyKo{vbZ7Yth6e;Hl$CkC
zLW9|v%xK9Rw;g-9@4)^NkVw8ZBa!QI@g*p^kUtaP2b;s6vMIM<C0rmgjxIk~J`;(O
z?*Z~Bis2c<^U#Yj4FxE%5nbS>v~n$9UF1uLZ~>5{oTl)}jFT3jJj-vQp`eBh6L@a8
z$zp14q~DZ7m_OEoYA}<WhN@nuz)HK1?4kxDH$f;S`8?eu(kB?mJBv>eB@MXPF|C(2
zs}O^&(Jdy*pDrDmm8*V$yfZ;zw>G%t`@}Xvw!uxC%<Szp+&eHbvK7~A+L$?bLs`q?
z^?}M8&u;P(OO1En+a^hXy~Xe@VKyW?V>+Zh#nUnMT`BB+qc$(Bep<gjldm#;{&e--
z)Xm2YaGN``DXq&b{u<$7xv8~0%_zv?l%a3@pd+NC-*h=2pF|Soes#ljL^*)wyNy&D
zxU{5v1=|DEJy2YX`1qd3Epj0)`$f!r9WZYwevDFX@|o{N;IN0*w5hAnA~GfrO;G>+
zIhE7zNZ~KM%3pMriadd5sA6SaAPRlY#F%eYJR7~h=z#m4C7RaKOz31JyU}dN5s|4K
zGJV<LyjHe#q((>C17<?9l+Fypmy&OfN9w0&w6q4sj}}HI1ll!KMrMrXb2XXgJ!Mb9
ztRak>KtQ<7GcxM&#xPzM`6I!vJ-u-0&s;uAwdP^Nm6F2CaRfT0f#EW<O30*WgQl*G
zZ|LcS+4c7Da?^;!q%Yt5jgb&jdOAAoLbl}(Q_%;F&6vBj^td^l*SZ?O%%>sir*`Gr
z^r2?tlAXW}0VDaVMaC@XGGDSZ{mcvx+BR!Wn^MgCpD&kjX27u_KG}3N8d})&UWcKJ
zyd|W|bHUM*)yhA{S(w!?jxkzRkS*2ibJMYnrL-y1v2H6#Yd3&d8eYmJHIPees~6$k
zOda9*yr)U;%VDSn7kC=UjL{@fivP2Dz=hG&!_SizlJ~YzDcOM8m-Q3AQc}RWFc;aw
z_^hq)b-_Ovj4nx~YRus;x3M%Q9PUx1A>KS%;J}d93(H6>BTtI%#PipZS5`(ED`CD-
zk&R7rl9$?_+q%U@=9>6EZe~4^4l}CAD#L>;&uodavHnuskfVfyMzC7k)56}<fHC6?
zW(CX_e!rBZrP#QTUVp4$Dz%o8QN3_NvZUCb#?}yxAoCsG9I?eE^PNN9E@hr^Q$2lV
ztM9YxY4%i%zf>F87<Jv))W{6H_6a9?5E|Bwse*81Qo_rZN>PGtG;=ShkZgx&48Pts
zDY77C=R0ypn*aT2hq<0%%5(W+J-4(zt8?5-^am^Vn#|#)^`4Q`IKT^O;+d&aGkw|F
zJZ<68?MUgfAv74`?*me_6WF#j#P-^yUWw9Um|nwRBrPiM3|y|(k#}^r2lXH{h=C0h
zo<VxiXM4<ioeA|zA~1rakgrYdg^w8o16yXdviT{+<Eb3}G*;>(K&VBZS;=P>)8L=^
zoy*I9%AbrTcKapXFQ=y*8DD|&wrBdEbPp^ehqQLApfz{P{1fl<G0PPuyB%d%NHOyU
zYrjCaoaaJ_$(yLJIB`b!`@i0m@pftFctwZSQnK{;tZ49IB(t2tqxYbR?R%i~o~aRD
zntK?B+y7}@&7Pvahr8t)KQ7rJZhicT4CiLfkNW0z!0bL+TWW1bt|?mjFyl{KfoNvu
zM9NIBNNt4nA_p!W>00?H??=VwmbB2zR|B_<Mn(>#HZ&6&BAxEcLz>Pu1k8Hyce%NI
z*oa>!V@spuO`M&dCs^tOO!M9^Nssx;=<UgETS2IkYG*0IgX<6V!sO|tNNL<OE$Wxx
z&(^eCcu#uS6s)Y$WhUP1HybNQph#Wh-8A#^wP0q%squNWNlDAsUaVJwA26jI`Ax^8
zOq$OO2Q&oF54&-<%_+8RP2ra`G|N3m?Qvozectg>(Zx*UNLDBqPEKRy)<`(Vr}-MI
zr;CG?A(KcjbKesi%ajghKOXc_9&5QAR**Hm{<<wArnA*KthVp0x~gxg?r4tl&>iQq
zGNlhI&xf-{aEu!7uF{^(>h49X)Lx>VWu^8nSb@EnRpndMYwC6Nv8vHQ9nyI^UpLcX
zeVQ)PEm+yyR=3mV=#ILR?yS48_2`Xy7OUQWz)JW9tchR5s<BS2X~_X~!Im{~vUZ#G
zpcSmB=|oQwW2c$E_%Bxdz;J-NLiJQbxGUN9Xe@Y+=N_nTVFmi-tU;f~N|%}3Ls_+X
zr@9J!=c-YxNPkdWqkgP@qOMhsaDQ7Zq#k2gPxF`>M{Sk>W2t(I)#%Sqvx#aIE3<EA
zZT9cfEVWmC#G2O{^%wQH`iy&t`dsIzC$*zbR~vP)?xVKqzIuRqUtg(*vX*<89;W`H
zN3c85r+T!WplYSYDo&l}Lz@Swa{)U5{1nP8<c_i*z#?|&VV?%QkQ&%4#QSp8P-GxA
z?u}_pJ4IM8AEPasvkJbH)$lE#fve?7o3Gkqp9Rf20PSo8sKT_YI6E0ScZQzQ%DK=q
zXxjQ*lTNYba1eCuuX^FXz^w1@XJRhJ?#JDq`wXalC6v92))>YP4Ohb*Uj70Xj(uDX
zw2QbeHs0PTJ51|cLESDkvj;70;s)*B)I!0PQX9cYuKqlWjH{Eeg=Yq`_eX!@6Sug|
zhYsocEb`miO~j*9>h1z9?A_f^qbD@*?p~B;Z!f2%f>5GAbhygBuQqh>(%5HLsFc3W
z$LV3zfflm2VBz7`f%dX@I+wUAxJ5SY?ZtWccSdHVMeW_yw5(U3Z_>i{wl(XKs@Day
zvAr(?pjXEW*wMk>(%Sy(Qug3b?4Bc+#7V5BL4PBM!V~sZhBrLATm_HV+frz_@7>M-
z%dn(J{T4GsV#-WRA2dN9a~I=3&o!bU0_@2#04mCLCKAxv#C3+orQ|6^Ui!mfm%tAf
zfp<S(UV&sa<F;2fc%&HG=eYE@&p@g*#4jcn8EH!^wF8R~+DR^UkKt;Cv<-n5<^Bee
zI0T;VXs97?-$m54h-V9=wG0V~K-nk~CYQ}`oUU~i&<ePl1L+&!FPGR|jjJ``rPQ*i
z%Yi0xSuL86%i?wc+|!fRlQDh=Y|#&c;HR_U_4Z&WBeY`Q?a?F6l|l#a7M*PGkfBCC
zH1KXq39$#ew@|^mTR;VSM^kj5aC4mljpf~;qwAOc&cjLeuCcf1fHYQKs6R|%(=C{t
z)`sNLrgF>3->Z#0+qt)`SzfAMvKLK5_i41b+(K!u#+Cz03>QfMB%ENm!9uXU7I1~!
zd}&KjMQ9_V6}c^i+7RFLRY*S|#?G>~M2odFw=x>MoXq9ug`>YJ;0n<-!b!qGxo~9Q
gwuv`QpEZ5zw2XaL3TS<~WWS(%I94aWW%=*_0Y=-JFaQ7m

literal 0
HcmV?d00001

diff --git a/docs/source/bertology.md b/docs/source/bertology.md
new file mode 100644
index 0000000000..e408484e84
--- /dev/null
+++ b/docs/source/bertology.md
@@ -0,0 +1 @@
+# Bertology
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 978b204466..706329c19a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -41,7 +41,8 @@ release = u'1.0.0'
 extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.coverage',
-    'sphinx.ext.napoleon'
+    'sphinx.ext.napoleon',
+    'recommonmark'
 
 ]
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4b5b982148..8505d0f019 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -7,11 +7,14 @@ Pytorch-Transformers
     :caption: Notes
 
     installation
+    philosophy
     usage
     examples
     notebooks
     tpu
     cli
+    migration
+    bertology
     torchscript
 
 
diff --git a/docs/source/migration.md b/docs/source/migration.md
new file mode 100644
index 0000000000..9165365fa8
--- /dev/null
+++ b/docs/source/migration.md
@@ -0,0 +1 @@
+# Migration
\ No newline at end of file
diff --git a/docs/source/philosophy.md b/docs/source/philosophy.md
new file mode 100644
index 0000000000..78c4f0309f
--- /dev/null
+++ b/docs/source/philosophy.md
@@ -0,0 +1 @@
+# Philosophy
\ No newline at end of file
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index c94ce35fe2..f5eb97f69d 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -122,9 +122,4 @@ Using the traced model for inference is as simple as using its ``__call__`` dund
 
 .. code-block:: python
 
-    traced_model(tokens_tensor, segments_tensors)
-
-(Optional) Using TorchScript in C++
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Below are examples of using a model exported using Python in C++.
+    traced_model(tokens_tensor, segments_tensors)
\ No newline at end of file

From 3b7cb7bf44f30b9fa0d1657a2d9f67a92b88abd4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 16:12:15 +0200
Subject: [PATCH 077/139] small update to run_glue

---
 examples/run_glue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 62d655ecc9..6387ed448a 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -382,7 +382,8 @@ def main():
 
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model

From e0e5c7faf5d50d5f67c4a9904a220a89d8c4594b Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 10:16:09 -0400
Subject: [PATCH 078/139] Added requirements.txt file.

---
 docs/README.md        |  2 +-
 docs/requirements.txt | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 docs/requirements.txt

diff --git a/docs/README.md b/docs/README.md
index b88cd50bbf..c39ecda0d1 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -56,5 +56,5 @@ It should build the static app that will be available under `/docs/_build/html`
 
 ## Adding a new element to the tree (toc-tree)
 
-Acceptes files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
+Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
 in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000000..112beb3f72
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,28 @@
+alabaster==0.7.12
+Babel==2.7.0
+certifi==2019.6.16
+chardet==3.0.4
+commonmark==0.9.0
+docutils==0.14
+future==0.17.1
+idna==2.8
+imagesize==1.1.0
+Jinja2==2.10.1
+MarkupSafe==1.1.1
+packaging==19.0
+Pygments==2.4.2
+pyparsing==2.4.0
+pytz==2019.1
+recommonmark==0.5.0
+requests==2.22.0
+six==1.12.0
+snowballstemmer==1.9.0
+Sphinx==2.1.2
+sphinx-rtd-theme==0.4.3
+sphinxcontrib-applehelp==1.0.1
+sphinxcontrib-devhelp==1.0.1
+sphinxcontrib-htmlhelp==1.0.2
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.2
+sphinxcontrib-serializinghtml==1.1.3
+urllib3==1.25.3

From 4ce237c880e890b49092faac07bb757376f0e3f4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 17:00:32 +0200
Subject: [PATCH 079/139] update run_glue

---
 examples/run_glue.py | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 6387ed448a..ec76dadcfe 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -69,7 +69,11 @@ def train(args, train_dataset, model):
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
-    num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+    if args.max_steps > 0:
+        num_train_optimization_steps = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer
     param_optimizer = list(model.named_parameters())
@@ -91,10 +95,8 @@ def train(args, train_dataset, model):
         warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
 
     else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                                lr=args.learning_rate,
-                                warmup=args.warmup_proportion,
-                                t_total=num_train_optimization_steps)
+        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion,
+                             t_total=num_train_optimization_steps)
 
     # Train!
     logger.info("***** Running training *****")
@@ -113,7 +115,7 @@ def train(args, train_dataset, model):
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
                       'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                       'labels':         batch[3]}
             ouputs = model(**inputs)
             loss = ouputs[0]
@@ -140,14 +142,16 @@ def train(args, train_dataset, model):
                     if not args.fp16:
                         tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                     tb_writer.add_scalar('loss', loss.item(), global_step)
+            if args.max_steps > 0 and global_step > args.max_steps:
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            break
 
     return global_step, tr_loss / global_step
 
 
 def evalutate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
-    if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
@@ -166,13 +170,13 @@ def evalutate(args, eval_task, eval_output_dir, dataset, model):
     out_label_ids = None
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
         batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
 
         with torch.no_grad():
-            outputs = model(input_ids,
-                            token_type_ids=segment_ids,
-                            attention_mask=input_mask,
-                            labels=label_ids)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]
 
         eval_loss += tmp_eval_loss.mean().item()
@@ -276,6 +280,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--no_cuda", action='store_true',
@@ -299,6 +305,9 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
@@ -320,8 +329,8 @@ def main():
 
     # Setup logging
     logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
     # Setup seeds
     random.seed(args.seed)
@@ -375,8 +384,6 @@ def main():
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
-        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)
 

From ed6c8d37f402847e65510a81e79b483c68733fe4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 17:14:52 +0200
Subject: [PATCH 080/139] fix merge

---
 examples/run_glue.py                  | 4 ++--
 pytorch_transformers/modeling_bert.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index ec76dadcfe..547a4e4698 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -183,10 +183,10 @@ def evalutate(args, eval_task, eval_output_dir, dataset, model):
         nb_eval_steps += 1
         if preds is None:
             preds = logits.detach().cpu().numpy()
-            out_label_ids = label_ids.detach().cpu().numpy()
+            out_label_ids = inputs['labels'].detach().cpu().numpy()
         else:
             preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
 
     eval_loss = eval_loss / nb_eval_steps
     if args.output_mode == "classification":
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 71045d1694..7cb723b563 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -176,7 +176,7 @@ class BertConfig(PretrainedConfig):
                 initializing all weight matrices.
             layer_norm_eps: The epsilon used by LayerNorm.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=30522,

From 8fe2c9d98ec7d82466138a59817af2fd0a376a6f Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 15:55:31 -0400
Subject: [PATCH 081/139] Refactored Docstrings of BERT, GPT2, GPT, TransfoXL,
 XLM and XLNet.

---
 docs/source/cli.rst                         |  10 +-
 docs/source/model_doc/bert.rst              |  22 +-
 docs/source/model_doc/gpt.rst               |  12 +-
 docs/source/model_doc/gpt2.rst              |  10 +-
 docs/source/model_doc/transformerxl.rst     |   8 +-
 docs/source/model_doc/xlm.rst               |  33 +-
 docs/source/usage.rst                       |   8 +-
 pytorch_transformers/modeling_bert.py       | 154 +++---
 pytorch_transformers/modeling_gpt2.py       | 302 +++++++-----
 pytorch_transformers/modeling_openai.py     | 339 +++++++------
 pytorch_transformers/modeling_transfo_xl.py | 251 +++++-----
 pytorch_transformers/modeling_xlm.py        | 518 ++++++++++----------
 pytorch_transformers/modeling_xlnet.py      |  20 +-
 13 files changed, 924 insertions(+), 763 deletions(-)

diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 22da24550b..217cd1a8be 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
 
    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-   pytorch_pretrained_bert bert \
+   pytorch_transformers bert \
      $BERT_BASE_DIR/bert_model.ckpt \
      $BERT_BASE_DIR/bert_config.json \
      $BERT_BASE_DIR/pytorch_model.bin
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 
    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-   pytorch_pretrained_bert gpt \
+   pytorch_transformers gpt \
      $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
      $PYTORCH_DUMP_OUTPUT \
      [OPENAI_GPT_CONFIG]
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 
    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-   pytorch_pretrained_bert transfo_xl \
+   pytorch_transformers transfo_xl \
      $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
      $PYTORCH_DUMP_OUTPUT \
      [TRANSFO_XL_CONFIG]
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
 
    export GPT2_DIR=/path/to/gpt2/checkpoint
 
-   pytorch_pretrained_bert gpt2 \
+   pytorch_transformers gpt2 \
      $GPT2_DIR/model.ckpt \
      $PYTORCH_DUMP_OUTPUT \
      [GPT2_CONFIG]
@@ -79,7 +79,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 
-   pytorch_pretrained_bert xlnet \
+   pytorch_transformers xlnet \
      $TRANSFO_XL_CHECKPOINT_PATH \
      $TRANSFO_XL_CONFIG_PATH \
      $PYTORCH_DUMP_OUTPUT \
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 7dc669af75..554c6e9b7c 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -4,75 +4,75 @@ BERT
 ``BertConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertConfig
+.. autoclass:: pytorch_transformers.BertConfig
     :members:
 
 
 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertTokenizer
+.. autoclass:: pytorch_transformers.BertTokenizer
     :members:
 
 
 ``BertAdam``
 ~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertAdam
+.. autoclass:: pytorch_transformers.BertAdam
     :members:
 
 1. ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertModel
+.. autoclass:: pytorch_transformers.BertModel
     :members:
 
 
 2. ``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForPreTraining
+.. autoclass:: pytorch_transformers.BertForPreTraining
     :members:
 
 
 3. ``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForMaskedLM
+.. autoclass:: pytorch_transformers.BertForMaskedLM
     :members:
 
 
 4. ``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForNextSentencePrediction
+.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
     :members:
 
 
 5. ``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForSequenceClassification
+.. autoclass:: pytorch_transformers.BertForSequenceClassification
     :members:
 
 
 6. ``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForMultipleChoice
+.. autoclass:: pytorch_transformers.BertForMultipleChoice
     :members:
 
 
 7. ``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForTokenClassification
+.. autoclass:: pytorch_transformers.BertForTokenClassification
     :members:
 
 
 8. ``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.BertForQuestionAnswering
+.. autoclass:: pytorch_transformers.BertForQuestionAnswering
     :members:
 
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 3db40719b3..b5e518759a 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -4,40 +4,40 @@ OpenAI GPT
 ``OpenAIGPTConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTConfig
+.. autoclass:: pytorch_transformers.OpenAIGPTConfig
     :members:
 
 
 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTTokenizer
+.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
     :members:
 
 
 ``OpenAIAdam``
 ~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIAdam
+.. autoclass:: pytorch_transformers.OpenAIAdam
     :members:
 
 
 9. ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTModel
+.. autoclass:: pytorch_transformers.OpenAIGPTModel
     :members:
 
 
 10. ``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTLMHeadModel
+.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
     :members:
 
 
 11. ``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.OpenAIGPTDoubleHeadsModel
+.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
     :members:
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index ca232ca876..fe2cd46c37 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -4,33 +4,33 @@ OpenAI GPT2
 ``GPT2Config``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.GPT2Config
+.. autoclass:: pytorch_transformers.GPT2Config
     :members:
 
 
 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.GPT2Tokenizer
+.. autoclass:: pytorch_transformers.GPT2Tokenizer
     :members:
 
 
 14. ``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.GPT2Model
+.. autoclass:: pytorch_transformers.GPT2Model
     :members:
 
 
 15. ``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.GPT2LMHeadModel
+.. autoclass:: pytorch_transformers.GPT2LMHeadModel
     :members:
 
 
 16. ``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.GPT2DoubleHeadsModel
+.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
     :members:
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index 2d2c38b250..20cc7a224c 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -5,26 +5,26 @@ Transformer XL
 ``TransfoXLConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.TransfoXLConfig
+.. autoclass:: pytorch_transformers.TransfoXLConfig
     :members:
 
 
 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.TransfoXLTokenizer
+.. autoclass:: pytorch_transformers.TransfoXLTokenizer
     :members:
 
 
 12. ``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.TransfoXLModel
+.. autoclass:: pytorch_transformers.TransfoXLModel
     :members:
 
 
 13. ``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_pretrained_bert.TransfoXLLMHeadModel
+.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
     :members:
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 086bf8782c..2c18016dcc 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -1,5 +1,36 @@
 XLM
 ----------------------------------------------------
 
+``XLMConfig``
+~~~~~~~~~~~~~~~~~~~~~
 
-I don't really know what to put here, I'll leave it up to you to decide @Thom
\ No newline at end of file
+.. autoclass:: pytorch_transformers.TransfoXLConfig
+    :members:
+
+
+17. ``XLMModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMModel
+    :members:
+
+
+18. ``XLMWithLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+    :members:
+
+
+19. ``XLMForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+    :members:
+
+
+20. ``XLMForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+    :members:
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 1e48a6ecf8..1abfa3c1aa 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -11,7 +11,7 @@ First let's prepare a tokenized input with ``BertTokenizer``
 .. code-block:: python
 
    import torch
-   from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+   from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 
    # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
    import logging
@@ -89,7 +89,7 @@ First let's prepare a tokenized input with ``OpenAIGPTTokenizer``
 .. code-block:: python
 
    import torch
-   from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+   from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
 
    # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
    import logging
@@ -177,7 +177,7 @@ First let's prepare a tokenized input with ``TransfoXLTokenizer``
 .. code-block:: python
 
    import torch
-   from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+   from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
 
    # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
    import logging
@@ -253,7 +253,7 @@ First let's prepare a tokenized input with ``GPT2Tokenizer``
 .. code-block:: python
 
    import torch
-   from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+   from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
 
    # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
    import logging
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 7cb723b563..acc5647cb5 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -150,7 +150,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
 class BertConfig(PretrainedConfig):
     r"""
-        :class:`~pytorch_pretrained_bert.BertConfig` is the configuration class to store the configuration of a
+        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
         `BertModel`.
 
         Arguments:
@@ -193,6 +193,29 @@ class BertConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
                  **kwargs):
         """Constructs BertConfig.
+
+        Arguments:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `BertModel`.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
         """
         super(BertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
@@ -219,6 +242,7 @@ class BertConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")
 
 
+
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 except ImportError:
@@ -568,7 +592,7 @@ class BertPreTrainedModel(PreTrainedModel):
 class BertModel(BertPreTrainedModel):
     r"""BERT model ("Bidirectional Embedding Representations from a Transformer").
 
-    :class:`~pytorch_pretrained_bert.BertModel` is the basic BERT Transformer model with a layer of summed token, \
+    :class:`~pytorch_transformers.BertModel` is the basic BERT Transformer model with a layer of summed token, \
     position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 \
     for BERT-large). The model is instantiated with the following parameters.
 
@@ -605,23 +629,23 @@ class BertModel(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
 
         Arguments:
-            input_ids: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the \
+            input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length] with the word token indices in the \
                 vocabulary(see the tokens pre-processing logic in the scripts `run_bert_extract_features.py`, \
                 `run_bert_classifier.py` and `run_bert_squad.py`)
-            token_type_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token \
+            token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token \
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
                 a `sentence B` token (see BERT paper for more details).
-            attention_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices \
+            attention_mask: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices \
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
                 input sequence length in the current batch. It's the mask that we typically use for attention when \
                 a batch has varying length sentences.
             output_all_encoded_layers: boolean which controls the content of the `encoded_layers` output as described \
             below. Default: `True`.
-            head_mask: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
+            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
             and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 \
             => head is not masked.
 
@@ -633,12 +657,12 @@ class BertModel(BertPreTrainedModel):
             If ``output_all_encoded_layers`` is set to True, outputs a list of the full sequences of \
             encoded-hidden-states at the end of each attention \
             block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a\
-            torch.FloatTensor of size [batch_size, sequence_length, hidden_size].
+            ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size].
 
             If set to False, outputs only the full sequence of hidden-states corresponding \
             to the last attention block of shape [batch_size, sequence_length, hidden_size].
 
-            ``pooled_output`` is a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a \
+            ``pooled_output`` is a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
             classifier pretrained on top of the hidden state associated to the first character of the \
             input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
 
@@ -731,38 +755,40 @@ class BertForPreTraining(BertPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Args:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            `masked_lm_labels`: optional masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
                 is only computed for the labels set in [0, ..., vocab_size]
-            `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
+            `next_sentence_label`: optional next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
                 with indices selected in [0, 1].
                 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
         Returns:
-            Either a torch.Tensor or tuple(torch.Tensor, torch.Tensor).
+            Either a ``torch.Tensor`` or ``tuple(torch.Tensor, torch.Tensor)``.
 
             if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
              sum of the masked language modeling loss and the next \
             sentence classification loss.
 
-            if ``masked_lm_labels`` or ``next_sentence_label` is `None``, outputs a tuple comprising:
-                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
+            if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple made of:
+
+                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]
+
                 - the next sentence classification logits of shape [batch_size, 2].
 
         Example ::
@@ -823,31 +849,31 @@ class BertForMaskedLM(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Args:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            `masked_lm_labels`: masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
                 is only computed for the labels set in [0, ..., vocab_size]
-            `head_mask`: an optional torch.LongTensor of shape [num_heads] with indices
+            `head_mask`: an optional ``torch.LongTensor`` of shape [num_heads] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
         Returns:
-            Masked language modeling loss if `masked_lm_labels` is specified, masked language modeling
+            Masked language modeling loss if ``masked_lm_labels`` is specified, masked language modeling
             logits of shape [batch_size, sequence_length, vocab_size] otherwise.
 
         Example::
@@ -901,30 +927,30 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Args:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+            `next_sentence_label`: next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
                 with indices selected in [0, 1].
                 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between
                 0 and 1.It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
                 0.0 => head is not masked.
 
         Returns:
-            If `next_sentence_label` is specified, outputs the total_loss which is the sum of the masked language \
-            modeling loss and the next sentence classification loss.
-            if `next_sentence_label` is `None`, outputs the next sentence classification logits of shape [batch_size, 2].
+            If ``next_sentence_label`` is specified, outputs the total_loss which is the sum of the masked language
+            modeling loss and the next sentence classification loss. If ``next_sentence_label`` is ``None``, outputs
+            the next sentence classification logits of shape [batch_size, 2].
 
 
         Example::
@@ -984,27 +1010,27 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Parameters:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
                 with indices selected in [0, ..., num_labels].
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
         Returns:
-            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
-            if `labels` is `None`, outputs the classification logits of shape `[batch_size, num_labels]`.
+            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
+            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
 
         Example::
 
@@ -1070,27 +1096,27 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Parameters:
-            `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
                 with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
                 and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
+            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
                 with indices selected in [0, ..., num_choices].
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
         Returns:
-            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
-            if `labels` is `None`, outputs the classification logits of shape [batch_size, num_labels].
+            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
+            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
 
         Example::
 
@@ -1159,27 +1185,27 @@ class BertForTokenClassification(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         """
-        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Parameters:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
+            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with indices selected in [0, ..., num_labels].
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
         Returns:
-            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
-            if `labels` is `None`, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
+            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
 
         Example::
 
@@ -1243,6 +1269,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
         """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
         Parameters:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
@@ -1260,13 +1288,13 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
                 Positions are clamped to the length of the sequence and position outside of the sequence are not taken
                 into account for computing the loss.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
         Returns:
-            if `start_positions` and `end_positions` are not `None`, outputs the total_loss which is the sum of the
+            If ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the
             CrossEntropy loss for the start and end token positions.
-            if `start_positions` or `end_positions` is `None`, outputs a tuple of start_logits, end_logits which are the
+            If ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of start_logits, end_logits which are the
             logits respectively for the start and end position tokens of shape [batch_size, sequence_length].
 
         Example::
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 840016098a..ec2abf72b9 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -101,6 +101,25 @@ def gelu(x):
 
 class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -418,9 +437,11 @@ class GPT2Model(GPT2PreTrainedModel):
     GPT-2 use a single embedding matrix to store the word and special embeddings.
     Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
     Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follow in the token embeddings matrix:
+    ::
 
-    The embeddings are ordered as follow in the token embeddings matrice:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
@@ -428,47 +449,24 @@ class GPT2Model(GPT2PreTrainedModel):
          ...                                                        -> special embeddings
          config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
 
-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
         `config`: a GPT2Config class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-    Outputs a tuple consisting of:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-        `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-            torch.FloatTensors. They can be reused to speed up sequential decoding.
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    Example::
 
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2Model(config)
-    hidden_states, presents = model(input_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2Model(config)
     """
 
     def __init__(self, config):
@@ -485,7 +483,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens=None):
-        " Update input embeddings with new embedding matrice if needed "
+        """Update input embeddings with new embedding matrix if needed."""
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
@@ -506,6 +504,47 @@ class GPT2Model(GPT2PreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+             A tuple consisting of ``hidden_states`` and ``presents``.
+
+                 ``hidden_states`` are a list of all the encoded-hidden-states in the model (length of the list: number of
+                 layers + 1 for the output of the embeddings) as ``torch.FloatTensor`` of size [batch_size, sequence_length,
+                 hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of
+                 input_ids).
+
+                 ``presents`` are a list of pre-computed hidden-states (key and values in each attention blocks) as
+                 torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            hidden_states, presents = model(input_ids)
+            # or
+            hidden_states, presents = model.forward(input_ids)
+
+        """
         if past is None:
             past_length = 0
             past = [None] * len(self.h)
@@ -580,50 +619,18 @@ class GPT2Model(GPT2PreTrainedModel):
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
 
-    Params:
+    Args:
         `config`: a GPT2Config class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else a tuple:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
-                (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids)
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    Example::
 
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2LMHeadModel(config)
-    lm_logits, presents = model(input_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2LMHeadModel(config)
     """
 
     def __init__(self, config):
@@ -633,14 +640,58 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+        TODO Shouldn't we put args + returns ?
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            If ``lm_labels`` is not ``None``, returns the language modeling loss. It ``lm_labels`` is ``None``, returns
+            a tuple of (``lm_logits``, ``presents``).
+
+                ``lm_logits`` is the language modeling logits as a ``torch.FloatTensor`` of size [batch_size,
+                sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ...
+                d_n are the dimension of input_ids).
+
+                ``presents`` is a list of pre-computed hidden-states (key and values in each attention blocks) as
+                torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            lm_logits, presents = model(input_ids)
+            # or
+            lm_logits, presents = model.forward(input_ids)
+
+        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
         hidden_states = transformer_outputs[0]
 
@@ -663,55 +714,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
 
-    Params:
+    Args:
         `config`: a GPT2Config class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, config.vocab_size[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., config.vocab_size]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2DoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2DoubleHeadsModel(config)
     """
 
     def __init__(self, config):
@@ -723,8 +735,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
+        TODO Shouldn't we put args + returns ?
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
@@ -732,6 +745,55 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
+                indices selected in the range [0, config.vocab_size[
+            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
+                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
+                with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., config.vocab_size]
+            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            If ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a
+            ``tuple(language_modeling_loss, multiple_choice_loss)``. If they are not ``None``, outputs a
+            ``tuple(lm_logits, multiple_choice_logits, presents)``.
+
+                ``lm_logits``: the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, num_choices, sequence_length, config.vocab_size]
+
+                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of size [batch_size, num_choices]
+
+                ``presents``: a list of pre-computed hidden-states (key and values in each attention blocks) as
+                torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
+            mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
+
+            lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
+            # or
+            lm_logits, multiple_choice_logits, presents = model.forward(input_ids, mc_token_ids)
+
+        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
         hidden_states = transformer_outputs[0]
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 024ff8eb41..37736efed7 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -127,7 +127,29 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 
 
 class OpenAIGPTConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `OpenAIGPTModel`.
+    """
+    Configuration class to store the configuration of a `OpenAIGPTModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        afn: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -157,27 +179,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         **kwargs
     ):
         """Constructs OpenAIGPTConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            afn: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            resid_pdrop: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
 
@@ -441,12 +442,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
 
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+    Special tokens embeddings are additional tokens that are not pre-trained, such as: [SEP], [CLS]...
+
+    Special tokens need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+
+    The embeddings are ordered as follow in the token embeddings matrix:
+
+    ::
 
-    The embeddings are ordered as follow in the token embeddings matrice:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
@@ -454,44 +459,25 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
          ...                                                        -> special embeddings
          config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
 
-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
         `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-    Outputs:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTModel(config)
-    hidden_states = model(input_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTModel(config)
     """
 
     def __init__(self, config):
@@ -507,7 +493,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens=None):
-        " Update input embeddings with new embedding matrice if needed "
+        """
+        Update input embeddings with new embedding matrice if needed
+
+        TODO
+
+        Args:
+            num_special_tokens:
+
+        Returns:
+
+        """
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
@@ -528,6 +524,37 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            ``hidden_states``, a list of all the encoded-hidden-states in the model (length of the list is number
+            of layers + 1 for the output of the embeddings)
+            as ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size]
+            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            hidden_states = model(input_ids)
+            # or
+            hidden_states = model.forward(input_ids)
+        """
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
             # start = self.config.vocab_size + self.config.n_special
@@ -594,10 +621,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
     OpenAI GPT use a single embedding matrix to store the word and special embeddings.
     Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    Special tokens need to be trained during the fine-tuning if you use them. The number of special embeddings
+    can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+
+    The embeddings are ordered as follow in the token embeddings matrix:
+
+    ::
 
-    The embeddings are ordered as follow in the token embeddings matrice:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
@@ -605,49 +635,25 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
          ...                                                        -> special embeddings
          config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
 
-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
         `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
-                (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids)
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTLMHeadModel(config)
-    lm_logits = model(input_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTLMHeadModel(config)
     """
 
     def __init__(self, config):
@@ -657,14 +663,50 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
+        TODO
+
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if ``lm_labels`` is not ``None``, outputs the language modeling loss. Otherwise, outputs ``lm_logits``,
+            the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, sequence_length,
+            total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are
+            the dimension of input_ids)
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            lm_logits = model(input_ids)
+            # or
+            lm_logits = model.forward(input_ids)
+        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
@@ -689,9 +731,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     OpenAI GPT use a single embedding matrix to store the word and special embeddings.
     Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
     Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)``
+    function.
+
+    The embeddings are ordered as follow in the token embeddings matrix:
+
+    ::
 
-    The embeddings are ordered as follow in the token embeddings matrice:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
@@ -699,54 +745,24 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
          ...                                                        -> special embeddings
          config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+
+    ::
+
         total_tokens_embeddings = config.vocab_size + config.n_special
+
     You should use the associate indices to index the embeddings.
 
-    Params:
+    Args:
         `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, total_tokens_embeddings[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., total_tokens_embeddings]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
     """
 
     def __init__(self, config):
@@ -761,6 +777,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """ Update input and output embeddings with new embedding matrice
             Make sure we are sharing the embeddings
+            TODO
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
@@ -768,6 +785,50 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
+                indices selected in the range [0, total_tokens_embeddings[
+            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
+                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
+                with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., total_tokens_embeddings]
+            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a tuple of losses with the
+            language modeling loss and the multiple choice loss. Otherwise, returns a
+            ``tuple(lm_logits, multiple_choice_logits)``.
+
+                ``lm_logits`` are the language modeling logits as a ``torch.FloatTensor`` of size
+                [batch_size, num_choices, sequence_length, total_tokens_embeddings]
+
+                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of
+                size [batch_size, num_choices]
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
+            mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
+
+            lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
+            # or
+            lm_logits, multiple_choice_logits = model.forward(input_ids, mc_token_ids)
+        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
         hidden_states = transformer_outputs[0]
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 0c5d127d62..c84f3b5ed2 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -177,6 +177,38 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
 
 class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            cutoffs: cutoffs for the adaptive softmax
+            d_model: Dimensionality of the model's hidden states.
+            d_embed: Dimensionality of the embeddings
+            d_head: Dimensionality of the model's heads.
+            div_val: divident value for adapative input and softmax
+            pre_lnorm: apply LayerNorm to the input instead of the output
+            d_inner: Inner dimension in FF
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            tgt_len: number of tokens to predict
+            ext_len: length of the extended context
+            mem_len: length of the retained previous heads
+            same_length: use the same attn length for all tokens
+            proj_share_all_but_first: True to share all but first projs, False not to share.
+            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+            clamp_len: use the same pos embeddings after clamp_len
+            sample_softmax: number of samples in sampled softmax
+            adaptive: use adaptive softmax
+            tie_weight: tie the word embedding and softmax weights
+            dropout: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            dropatt: The dropout ratio for the attention probabilities.
+            untie_r: untie relative position biases
+            embd_pdrop: The dropout ratio for the embeddings.
+            init: parameter initializer to use
+            init_range: parameters initialized by U(-init_range, init_range).
+            proj_init_std: parameters initialized by N(0, init_std)
+            init_std: parameters initialized by N(0, init_std)
     """
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -210,38 +242,6 @@ class TransfoXLConfig(PretrainedConfig):
                  init_std=0.02,
                  **kwargs):
         """Constructs TransfoXLConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
-            cutoffs: cutoffs for the adaptive softmax
-            d_model: Dimensionality of the model's hidden states.
-            d_embed: Dimensionality of the embeddings
-            d_head: Dimensionality of the model's heads.
-            div_val: divident value for adapative input and softmax
-            pre_lnorm: apply LayerNorm to the input instead of the output
-            d_inner: Inner dimension in FF
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            tgt_len: number of tokens to predict
-            ext_len: length of the extended context
-            mem_len: length of the retained previous heads
-            same_length: use the same attn length for all tokens
-            proj_share_all_but_first: True to share all but first projs, False not to share.
-            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-            clamp_len: use the same pos embeddings after clamp_len
-            sample_softmax: number of samples in sampled softmax
-            adaptive: use adaptive softmax
-            tie_weight: tie the word embedding and softmax weights
-            dropout: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            dropatt: The dropout ratio for the attention probabilities.
-            untie_r: untie relative position biases
-            embd_pdrop: The dropout ratio for the embeddings.
-            init: parameter initializer to use
-            init_range: parameters initialized by U(-init_range, init_range).
-            proj_init_std: parameters initialized by N(0, init_std)
-            init_std: parameters initialized by N(0, init_std)
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
 
@@ -901,42 +901,20 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
 class TransfoXLModel(TransfoXLPreTrainedModel):
     """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
 
-    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted to decreasing frequency.
+    Transformer XL uses relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
 
-    Params:
+        - you don't need to specify positioning embeddings indices.
+
+        - the tokens in the vocabulary have to be sorted in decreasing frequency.
+
+    Args:
         config: a TransfoXLConfig class instance with the configuration to build a new model
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the token indices selected in the range [0, self.config.n_token[
-        `mems`: optional memomry of hidden states from previous forward passes
-            as a list (num layers) of hidden states at the entry of each layer
-            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-    Outputs:
-        A tuple of (last_hidden_state, new_mems)
-        `last_hidden_state`: the encoded-hidden-states at the top of the model
-            as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
-        `new_mems`: list (num layers) of updated mem states at the entry of each layer
-            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
+    Example::
 
-    config = TransfoXLConfig()
-
-    model = TransfoXLModel(config)
-    last_hidden_state, new_mems = model(input_ids)
-
-    # Another time on input_ids_next using the memory:
-    last_hidden_state, new_mems = model(input_ids_next, new_mems)
-    ```
+        config = TransfoXLConfig()
+        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
@@ -1200,18 +1178,40 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
     def forward(self, input_ids, mems=None, head_mask=None):
-        """ Params:
-                input_ids :: [bsz, len]
-                mems :: optional mems from previous forwar passes (or init_mems)
-                    list (num layers) of mem states at the entry of each layer
-                        shape :: [self.config.mem_len, bsz, self.config.d_model]
-                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-            Returns:
-                tuple (last_hidden, new_mems) where:
-                    new_mems: list (num layers) of mem states at the entry of each layer
-                        shape :: [self.config.mem_len, bsz, self.config.d_model]
-                    last_hidden: output of the last layer:
-                        shape :: [bsz, len, self.config.d_model]
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the token indices selected in the range [0, self.config.n_token[
+            `mems`: optional memory of hidden states from previous forward passes
+                as a list (num layers) of hidden states at the entry of each layer
+                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+
+        Returns:
+            A tuple of ``(last_hidden_state, new_mems)``.
+
+                ``last_hidden_state``: the encoded-hidden-states at the top of the model
+                as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
+
+                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
+                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
+                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
+                ``labels``
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
+
+            last_hidden_state, new_mems = model(input_ids)
+            # or
+            last_hidden_state, new_mems = model.forward(input_ids)
+
+            # Another time on input_ids_next using the memory:
+            last_hidden_state, new_mems = model(input_ids_next, new_mems)
         """
         # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
         # so we transpose here from shape [bsz, len] to shape [len, bsz]
@@ -1227,52 +1227,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
 
-    This model add an (adaptive) softmax head on top of the TransfoXLModel
+    This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
 
-    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted to decreasing frequency.
+    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
 
-    Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied.
+        - you don't need to specify positioning embeddings indices
 
-    Params:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
+        - the tokens in the vocabulary have to be sorted in decreasing frequency.
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the token indices selected in the range [0, self.config.n_token[
-        `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length]
-            with the labels token indices selected in the range [0, self.config.n_token[
-        `mems`: an optional memory of hidden states from previous forward passes
-            as a list (num layers) of hidden states at the entry of each layer
-            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+    Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
 
-    Outputs:
-        A tuple of (last_hidden_state, new_mems)
-        `softmax_output`: output of the (adaptive) softmax:
-            if labels is None:
-                Negative log likelihood of shape [batch_size, sequence_length] 
-            else:
-                log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
-        `new_mems`: list (num layers) of updated mem states at the entry of each layer
-            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+    Args:
+        config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
 
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
 
-    config = TransfoXLConfig()
+    Example::
 
-    model = TransfoXLModel(config)
-    last_hidden_state, new_mems = model(input_ids)
-
-    # Another time on input_ids_next using the memory:
-    last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
-    ```
+        config = TransfoXLConfig()
+        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLLMHeadModel, self).__init__(config)
@@ -1290,7 +1262,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         self.tie_weights()
 
     def tie_weights(self):
-        """ Run this to be sure output and input (adaptive) softmax weights are tied """
+        """
+        Run this to be sure output and input (adaptive) softmax weights are tied
+        """
         # sampled softmax
         if self.sample_softmax > 0:
             if self.config.tie_weight:
@@ -1314,18 +1288,43 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         return self.transformer.init_mems(data)
 
     def forward(self, input_ids, labels=None, mems=None, head_mask=None):
-        """ Params:
-                input_ids :: [bsz, len]
-                labels :: [bsz, len]
-            Returns:
-                tuple(softmax_output, new_mems) where:
-                    new_mems: list (num layers) of hidden states at the entry of each layer
-                        shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids
-                    softmax_output: output of the (adaptive) softmax:
-                        if labels is None:
-                            Negative log likelihood of shape :: [bsz, len] 
-                        else:
-                            log probabilities of tokens, shape :: [bsz, len, n_tokens]
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the token indices selected in the range [0, self.config.n_token[
+            `labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the labels token indices selected in the range [0, self.config.n_token[
+            `mems`: an optional memory of hidden states from previous forward passes
+                as a list (num layers) of hidden states at the entry of each layer
+                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+
+        Returns:
+            A tuple of (last_hidden_state, new_mems)
+
+                ``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the negative
+                log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of
+                tokens of, shape [batch_size, sequence_length, n_tokens].
+
+                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
+                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
+                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
+                ``labels``
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
+
+            last_hidden_state, new_mems = model(input_ids)
+            # or
+            last_hidden_state, new_mems = model.forward(input_ids)
+
+            # Another time on input_ids_next using the memory:
+            last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
         """
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 65db9e7159..72ec6397a0 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -45,6 +45,46 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: untie relative position biases
+        attn_type: 'bi' for XLM, 'uni' for Transformer-XL
+
+        dropout: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        dropatt: The dropout ratio for the attention
+            probabilities.
+        max_position_embeddings: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        dropout: float, dropout rate.
+        dropatt: float, dropout rate on attention probabilities.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
     """
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -83,46 +123,6 @@ class XLMConfig(PretrainedConfig):
                  end_n_top=5,
                  **kwargs):
         """Constructs XLMConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
-            d_model: Size of the encoder layers and the pooler layer.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            d_inner: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            ff_activation: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            untie_r: untie relative position biases
-            attn_type: 'bi' for XLM, 'uni' for Transformer-XL
-
-            dropout: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            dropatt: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            layer_norm_eps: The epsilon used by LayerNorm.
-
-            dropout: float, dropout rate.
-            dropatt: float, dropout rate on attention probabilities.
-            init: str, the initialization scheme, either "normal" or "uniform".
-            init_range: float, initialize the parameters with a uniform distribution
-                in [-init_range, init_range]. Only effective when init="uniform".
-            init_std: float, initialize the parameters with a normal distribution
-                with mean 0 and stddev init_std. Only effective when init="normal".
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
         """
         super(XLMConfig, self).__init__(**kwargs)
 
@@ -377,6 +377,26 @@ class XLMPreTrainedModel(PreTrainedModel):
 
 
 class XLMModel(XLMPreTrainedModel):
+    """
+    XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+
+    Paper: https://arxiv.org/abs/1901.07291
+
+    Original code: https://github.com/facebookresearch/XLM
+
+    Args:
+        `config`: a XLMConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    Example::
+
+        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+        model = modeling.XLMModel(config=config)
+    """
 
     ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
                   'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', 
@@ -384,57 +404,6 @@ class XLMModel(XLMPreTrainedModel):
                   'asm_cutoffs', 'asm_div_value']
 
     def __init__(self, config):  #, dico, is_encoder, with_output):
-        """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-            Paper: https://arxiv.org/abs/1901.07291
-            Original code: https://github.com/facebookresearch/XLM
-
-        Params:
-            `config`: a XLMConfig class instance with the configuration to build a new model
-            `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-            `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-                This can be used to compute head importance metrics. Default: False
-
-        Inputs:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLM paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-
-        Outputs: Tuple of (encoded_layers, pooled_output)
-            `encoded_layers`: controled by `output_all_encoded_layers` argument:
-                - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                    of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
-                    encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-                - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
-            `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
-                classifier pretrained on top of the hidden state associated to the first character of the
-                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-
-        Example usage:
-        ```python
-        # Already been converted into WordPiece token ids
-        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.XLMModel(config=config)
-        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-        ```
-        """
         super(XLMModel, self).__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -507,12 +476,53 @@ class XLMModel(XLMPreTrainedModel):
     def forward(self, input_ids, lengths=None, positions=None, langs=None,
                 token_type_ids=None, attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
         """
-        Inputs:
-            `input_ids` LongTensor(bs, slen), containing word indices
-            `lengths` LongTensor(bs), containing the length of each sentence
-            `positions` LongTensor(bs, slen), containing word positions
-            `langs` LongTensor(bs, slen), containing language IDs
-            `token_type_ids` LongTensor (bs, slen) same as `langs` used for compatibility
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Parameters:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `lengths`: ``torch.LongTensor`` of size ``bs``, containing the length of each sentence
+            `positions`: ``torch.LongTensor`` of size ``(bs, slen)``, containing word positions
+            `langs`: ``torch.LongTensor`` of size ``(bs, slen)``, containing language IDs
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see XLM paper for more details).
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `cache`: TODO
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+        Returns:
+            A ``tuple(encoded_layers, pooled_output)``, with
+
+            ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
+
+                - ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states at the end \
+                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
+                encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
+
+                - ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+
+            ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a
+            classifier pre-trained on top of the hidden state associated to the first character of the
+            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         if lengths is None:
             lengths = (input_ids != self.pad_index).sum(dim=1).long()
@@ -674,55 +684,23 @@ class XLMPredLayer(nn.Module):
 
 class XLMWithLMHeadModel(XLMPreTrainedModel):
     """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-        Paper: https://arxiv.org/abs/1901.07291
-        Original code: https://github.com/facebookresearch/XLM
 
-    Params:
+    Paper: https://arxiv.org/abs/1901.07291
+
+    Original code: https://github.com/facebookresearch/XLM
+
+    Args:
         `config`: a XLMConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see XLM paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
+        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Outputs: Tuple of (encoded_layers, pooled_output)
-        `encoded_layers`: controled by `output_all_encoded_layers` argument:
-            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each
-                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                to the last attention block of shape [batch_size, sequence_length, hidden_size],
-        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
-            classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = modeling.XLMModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = modeling.XLMModel(config=config)
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
@@ -746,29 +724,51 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                 attention_mask=None, cache=None, labels=None, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
-            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
-            inp_q: float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `lengths`: TODO
+            `positions`: TODO
+            `langs`: TODO
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see XLM paper for more details).
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `cache`: TODO
+            `labels`: TODO
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-            summary_type: str, "last", "first", "mean", or "attn". The method
-                to pool the input to get a vector representation.
+
+        Returns:
+            A ``tuple(encoded_layers, pooled_output)``, with
+
+                ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
+
+                    If ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
+                    at the end of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
+                    encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
+
+                    If ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
+                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
+
+                ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
+                classifier pre-trained on top of the hidden state associated to the first character of the \
+                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
@@ -783,7 +783,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 class XLMForSequenceClassification(XLMPreTrainedModel):
     """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
 
-    Params:
+    Args:
         `config`: a XLMConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
@@ -791,58 +791,15 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         `summary_type`: str, "last", "first", "mean", or "attn". The method
             to pool the input to get a vector representation. Default: last
 
-    Inputs:
-        inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
-        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: float32 Tensor in shape [bsz, len], the input mask.
-            0 for real tokens and 1 for padding.
-        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the XLM model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
-        mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-            from previous batches. The length of the list equals n_layer.
-            If None, no memory is used.
-        perm_mask: float32 Tensor in shape [bsz, len, len].
-            If perm_mask[k, i, j] = 0, i attend to j in batch k;
-            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-            If None, each position attends to all the others.
-        target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-            on the j-th token.
-            Only used during pretraining for partial prediction.
-            Set to None during finetuning.
-        inp_q: float32 Tensor in shape [bsz, len].
-            1 for tokens with losses and 0 for tokens without losses.
-            Only used during pretraining for two-stream attention.
-            Set to None during finetuning.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
-    Outputs: Tuple of (logits or loss, mems)
-        `logits or loss`:
-            if labels is None:
-                Token logits with shape [batch_size, sequence_length] 
-            else:
-                CrossEntropy loss with the targets
-        `new_mems`: list (num layers) of updated mem states at the entry of each layer
-            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
+            n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
-    config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
-        n_layer=12, num_attention_heads=12, intermediate_size=3072)
+        model = modeling.XLMModel(config=config)
 
-    model = modeling.XLMModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
     """
     def __init__(self, config):
         super(XLMForSequenceClassification, self).__init__(config)
@@ -857,30 +814,36 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
                 attention_mask=None, cache=None, labels=None, head_mask=None):
         """
         Args:
-            inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
+            input_ids: TODO
+            lengths: TODO
+            positions: TODO
+            langs: TODO
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
             attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
                 but with 1 for real tokens and 0 for padding.
                 Added for easy compatibility with the XLM model (which uses this negative masking).
                 You can only uses one among `input_mask` and `attention_mask`
-            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
-            inp_q: float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
+            cache: TODO
+            labels: TODO
+            head_mask: TODO
+
+
+        Returns:
+            A ``tuple(logits_or_loss, new_mems)``. If ``labels`` is ``None``, return token logits with shape
+            [batch_size, sequence_length]. If it isn't ``None``, return the ``CrossEntropy`` loss with the targets.
+
+            ``new_mems`` is a list (num layers) of updated mem states at the entry of each layer \
+            each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
+            Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
@@ -904,60 +867,25 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
 
 class XLMForQuestionAnswering(XLMPreTrainedModel):
-    """XLM model for Question Answering (span extraction).
+    """
+    XLM model for Question Answering (span extraction).
     This module is composed of the XLM model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
 
-    Params:
+    Args:
         `config`: a XLMConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see XLM paper for more details).
-        `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the XLM model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
-        `input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
-    Outputs:
-        if `start_positions` and `end_positions` are not `None`:
-            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
-        if `start_positions` or `end_positions` is `None`:
-            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
-            position tokens of shape [batch_size, sequence_length].
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    Example::
 
-    config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    model = XLMForQuestionAnswering(config)
-    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = XLMForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(XLMForQuestionAnswering, self).__init__(config)
@@ -971,6 +899,58 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
                 attention_mask=None, cache=None, start_positions=None, end_positions=None,
                 cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
 
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            lengths: TODO
+            positions: TODO
+            langs: TODO
+            token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see XLM paper for more details).
+            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the XLM model (which uses this negative masking).
+                You can only uses one among `input_mask` and `attention_mask`
+            cache: TODO
+            start_positions: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            end_positions: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            cls_index: TODO
+            is_impossible: TODO
+            p_mask: TODO
+            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            Either the ``total_loss`` or a ``tuple(start_logits, end_logits)``
+
+                if ``start_positions`` and ``end_positions`` are not ``None``, \
+                outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
+
+                if ``start_positions`` or ``end_positions`` is ``None``:
+                Outputs a ``tuple(start_logits, end_logits)`` which are the logits respectively for the start and end
+                position tokens of shape [batch_size, sequence_length].
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
+
         transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index e0b3fb0661..3851d62c44 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -958,10 +958,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         `encoded_layers`: controled by `output_all_encoded_layers` argument:
             - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                 of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), each
-                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, d_model],
+                encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
             - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                 to the last attention block of shape [batch_size, sequence_length, d_model],
-        `pooled_output`: a torch.FloatTensor of size [batch_size, d_model] which is the output of a
+        `pooled_output`: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a
             classifier pretrained on top of the hidden state associated to the first character of the
             input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
 
@@ -1087,7 +1087,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             1 for tokens with losses and 0 for tokens without losses.
             Only used during pretraining for two-stream attention.
             Set to None during finetuning.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+        `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
             It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
@@ -1098,7 +1098,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             else:
                 CrossEntropy loss with the targets
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
-            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
             Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Example usage:
@@ -1189,27 +1189,27 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             This can be used to compute head importance metrics. Default: False
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+        `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
             `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+        `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
             a `sentence B` token (see XLNet paper for more details).
         `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
             but with 1 for real tokens and 0 for padding.
             Added for easy compatibility with the BERT model (which uses this negative masking).
             You can only uses one among `input_mask` and `attention_mask`
-        `input_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+        `input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
             input sequence length in the current batch. It's the mask that we typically use for attention when
             a batch has varying length sentences.
-        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+        `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
             Positions are clamped to the length of the sequence and position outside of the sequence are not taken
             into account for computing the loss.
-        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+        `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
             Positions are clamped to the length of the sequence and position outside of the sequence are not taken
             into account for computing the loss.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+        `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
             It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
     Outputs:

From 83fb311ef7776a278c6da4b4f18a625d1e11ecfa Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 16:38:30 -0400
Subject: [PATCH 082/139] Patched warnings + Refactored XLNet's Docstrings

---
 docs/source/examples.rst               |  10 +-
 docs/source/index.rst                  |   4 +-
 docs/source/model_doc/overview.rst     |  20 +-
 docs/source/model_doc/xlm.rst          |  12 +-
 docs/source/model_doc/xlnet.rst        |  34 +-
 pytorch_transformers/modeling_bert.py  |   2 +
 pytorch_transformers/modeling_xlnet.py | 415 +++++++++++++------------
 7 files changed, 268 insertions(+), 229 deletions(-)

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 92e7ee661a..e7e1958d78 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -29,9 +29,9 @@ Here is how to use these techniques in our scripts:
 * **Gradient Accumulation**\ : Gradient accumulation can be used by supplying a integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradient will be accumulated over ``gradient_accumulation_steps`` steps.
 * **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
 * **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below).
-* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`_ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`_. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
+* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`__ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
 
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`_. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
 
 Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <(https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_\ ) for more details):
 
@@ -153,10 +153,10 @@ and unpack it to some directory ``$GLUE_DIR``.
      --num_train_epochs 3.0 \
      --output_dir /tmp/mrpc_output/
 
-Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`_ gave evaluation results between 84% and 88%.
+Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`__ gave evaluation results between 84% and 88%.
 
 **Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
-First install apex as indicated `here <https://github.com/NVIDIA/apex>`_.
+First install apex as indicated `here <https://github.com/NVIDIA/apex>`__.
 Then run
 
 .. code-block:: shell
@@ -520,7 +520,7 @@ and unpack it to some directory ``$GLUE_DIR``.
     --num_train_epochs 3.0 \
     --output_dir /tmp/mrpc_output/
 
-Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`_ gave evaluation results between 84% and 88%.
+Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`__ gave evaluation results between 84% and 88%.
 
 **Distributed training**
 Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8505d0f019..dd92507f15 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -52,13 +52,13 @@ Here are some information on these models:
 This PyTorch implementation of BERT is provided with `Google's pre-trained models <https://github.com/google-research/bert>`_\ , examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
 **OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`_ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`__ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
 **Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`_ by Zihang Dai\ *, Zhilin Yang*\ , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`_ which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
 **OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`_ by Alec Radford\ *, Jeffrey Wu*\ , Rewon Child, David Luan, Dario Amodei\ ** and Ilya Sutskever**.
-This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`_ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
 Content
 -------
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index 7c426aa798..f76c010e85 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -9,17 +9,17 @@ Here is a detailed documentation of the classes in the package and how to use th
 
    * - Sub-section
      - Description
-   * - `Loading pre-trained weights <#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump>`_
+   * - `Loading pre-trained weights <#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump>`__
      - How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance
-   * - `Serialization best-practices <#serialization-best-practices>`_
+   * - `Serialization best-practices <#serialization-best-practices>`__
      - How to save and reload a fine-tuned model
-   * - `Configurations <#configurations>`_
+   * - `Configurations <#configurations>`__
      - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Models <#models>`_
+   * - `Models <#models>`__
      - API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Tokenizers <#tokenizers>`_
+   * - `Tokenizers <#tokenizers>`__
      - API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Optimizers <#optimizers>`_
+   * - `Optimizers <#optimizers>`__
      - API of the optimizers
 
 
@@ -77,7 +77,7 @@ where
     * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
     * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
     * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`_
+    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
     * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
     * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
     * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
@@ -93,7 +93,7 @@ where
     * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
     * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
 
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/modeling.py>`_\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/modeling.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
 
 *
   ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
@@ -102,7 +102,7 @@ where
 * ``state_dict``\ : an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
 * ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
-``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`_ or the original TensorFlow repository.
+``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
 
 When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
 
@@ -152,7 +152,7 @@ This section explain how you can save and re-load a fine-tuned model (BERT, GPT,
 There are three types of files you need to save to be able to reload a fine-tuned model:
 
 
-* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`_\ ,
+* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
 * the configuration file of the model which is saved as a JSON file, and
 * the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
 
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 2c18016dcc..3d7a9e5e44 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -2,35 +2,35 @@ XLM
 ----------------------------------------------------
 
 ``XLMConfig``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.TransfoXLConfig
+.. autoclass:: pytorch_transformers.XLMConfig
     :members:
 
 
 17. ``XLMModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMModel
     :members:
 
 
 18. ``XLMWithLMHeadModel``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMWithLMHeadModel
     :members:
 
 
 19. ``XLMForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMForSequenceClassification
     :members:
 
 
 20. ``XLMForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMForQuestionAnswering
     :members:
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index 8138d1bcdb..b150e771bd 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -1,4 +1,36 @@
 XLNet
 ----------------------------------------------------
 
-I don't really know what to put here, I'll leave it up to you to decide @Thom
\ No newline at end of file
+``XLNetConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetConfig
+    :members:
+
+
+21. ``XLNetModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetModel
+    :members:
+
+
+22. ``XLNetLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+    :members:
+
+
+23. ``XLNetForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+    :members:
+
+
+24. ``XLNetForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+    :members:
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index acc5647cb5..848296ad9f 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -729,7 +729,9 @@ class BertModel(BertPreTrainedModel):
 class BertForPreTraining(BertPreTrainedModel):
     """BERT model with pre-training heads.
     This module comprises the BERT model followed by the two pre-training heads:
+
         - the masked language modeling head, and
+
         - the next sentence classification head.
 
     Args:
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 3851d62c44..5fee4e8524 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -192,7 +192,48 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
 
 class XLNetConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `XLNetModel`.
+    """Configuration class to store the configuration of a ``XLNetModel``.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: untie relative position biases
+        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
+
+        dropout: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        dropatt: The dropout ratio for the attention
+            probabilities.
+        max_position_embeddings: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        dropout: float, dropout rate.
+        dropatt: float, dropout rate on attention probabilities.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+        finetuning_task: name of the glue task on which the model was fine-tuned if any
     """
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -231,47 +272,6 @@ class XLNetConfig(PretrainedConfig):
                  end_n_top=5,
                  **kwargs):
         """Constructs XLNetConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLNetModel`.
-            d_model: Size of the encoder layers and the pooler layer.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            d_inner: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            ff_activation: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            untie_r: untie relative position biases
-            attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
-
-            dropout: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            dropatt: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            layer_norm_eps: The epsilon used by LayerNorm.
-
-            dropout: float, dropout rate.
-            dropatt: float, dropout rate on attention probabilities.
-            init: str, the initialization scheme, either "normal" or "uniform".
-            init_range: float, initialize the parameters with a uniform distribution
-                in [-init_range, init_range]. Only effective when init="uniform".
-            init_std: float, initialize the parameters with a normal distribution
-                with mean 0 and stddev init_std. Only effective when init="normal".
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
-            finetuning_task: name of the glue task on which the model was fine-tuned if any
         """
         super(XLNetConfig, self).__init__(**kwargs)
 
@@ -621,6 +621,18 @@ class XLNetPreTrainedModel(PreTrainedModel):
 
 
 class XLNetModel(XLNetPreTrainedModel):
+    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+
+    TODO: this was copied from the XLNetLMHeadModel, check that it's ok.
+
+    Args:
+        `config`: a XLNetConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False
+
+    TODO: Add usage
+    """
     def __init__(self, config):
         super(XLNetModel, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -647,15 +659,23 @@ class XLNetModel(XLNetPreTrainedModel):
         pass
 
     def create_mask(self, qlen, mlen):
-        """ create causal attention mask.
-            float mask where 1.0 indicate masked, 0.0 indicated not-masked.
-             same_length=False:      same_length=True:
-             <mlen > <  qlen >       <mlen > <  qlen >
-          ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
-            [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
-       qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
-            [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
-          v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
+        """
+        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
+
+        Args:
+            qlen: TODO
+            mlen: TODO
+
+        ::
+
+                  same_length=False:      same_length=True:
+                  <mlen > <  qlen >       <mlen > <  qlen >
+               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
+                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
+            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
+                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
+               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
+
         """
         attn_mask = torch.ones([qlen, qlen])
         mask_up = torch.triu(attn_mask, diagonal=1)
@@ -736,6 +756,8 @@ class XLNetModel(XLNetPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
         """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
         Args:
             input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
@@ -772,6 +794,8 @@ class XLNetModel(XLNetPreTrainedModel):
             same_length: bool, whether to use the same attention length for each token.
             summary_type: str, "last", "first", "mean", or "attn". The method
                 to pool the input to get a vector representation.
+
+        TODO: Add usage
         """
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
@@ -921,63 +945,20 @@ class XLNetModel(XLNetPreTrainedModel):
 class XLNetLMHeadModel(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
 
-    Params:
+    Args:
         `config`: a XLNetConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
-        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
-            0 for real tokens and 1 for padding.
-        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
-        mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-            from previous batches. The length of the list equals n_layer.
-            If None, no memory is used.
-        perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
-            If perm_mask[k, i, j] = 0, i attend to j in batch k;
-            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-            If None, each position attends to all the others.
-        target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
-            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-            on the j-th token.
-            Only used during pretraining for partial prediction.
-            Set to None during finetuning.
-        inp_q: [optional] float32 Tensor in shape [bsz, len].
-            1 for tokens with losses and 0 for tokens without losses.
-            Only used during pretraining for two-stream attention.
-            Set to None during finetuning.
 
 
-    Outputs: Tuple of (encoded_layers, pooled_output)
-        `encoded_layers`: controled by `output_all_encoded_layers` argument:
-            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
-                of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), each
-                encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
-            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
-                to the last attention block of shape [batch_size, sequence_length, d_model],
-        `pooled_output`: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a
-            classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
+            n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
-    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
-        n_layer=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = modeling.XLNetModel(config=config)
     """
     def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
@@ -1005,34 +986,61 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 labels=None, head_mask=None):
         """
+         all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+
         Args:
             input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
+            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
                 0 for real tokens and 1 for padding.
             attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
                 but with 1 for real tokens and 0 for padding.
                 Added for easy compatibility with the BERT model (which uses this negative masking).
                 You can only uses one among `input_mask` and `attention_mask`
-            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
+            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
                 from previous batches. The length of the list equals n_layer.
                 If None, no memory is used.
-            perm_mask: float32 Tensor in shape [bsz, len, len].
+            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
                 If perm_mask[k, i, j] = 0, i attend to j in batch k;
                 if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
                 If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
+            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
                 If target_mapping[k, i, j] = 1, the i-th predict in batch k is
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: float32 Tensor in shape [bsz, len].
+            inp_q: [optional] float32 Tensor in shape [bsz, len].
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
 
-            summary_type: str, "last", "first", "mean", or "attn". The method
-                to pool the input to get a vector representation.
+
+        Returns:
+            A ``tuple(encoded_layers, pooled_output)``, with
+
+                ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
+
+                    - ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
+                    at the end of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), \
+                    each encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
+
+                    - ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
+                    to the last attention block of shape [batch_size, sequence_length, d_model],
+
+                ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a \
+                classifier pretrained on top of the hidden state associated to the first character of the \
+                input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
@@ -1054,7 +1062,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
 
-    Params:
+    Args:
         `config`: a XLNetConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
@@ -1062,58 +1070,16 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         `summary_type`: str, "last", "first", "mean", or "attn". The method
             to pool the input to get a vector representation. Default: last
 
-    Inputs:
-        input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
-        token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-        input_mask: float32 Tensor in shape [bsz, len], the input mask.
-            0 for real tokens and 1 for padding.
-        attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
-        mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-            from previous batches. The length of the list equals n_layer.
-            If None, no memory is used.
-        perm_mask: float32 Tensor in shape [bsz, len, len].
-            If perm_mask[k, i, j] = 0, i attend to j in batch k;
-            if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-            If None, each position attends to all the others.
-        target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-            If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-            on the j-th token.
-            Only used during pretraining for partial prediction.
-            Set to None during finetuning.
-        inp_q: float32 Tensor in shape [bsz, len].
-            1 for tokens with losses and 0 for tokens without losses.
-            Only used during pretraining for two-stream attention.
-            Set to None during finetuning.
-        `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 
 
-    Outputs: Tuple of (logits or loss, mems)
-        `logits or loss`:
-            if labels is None:
-                Token logits with shape [batch_size, sequence_length] 
-            else:
-                CrossEntropy loss with the targets
-        `new_mems`: list (num layers) of updated mem states at the entry of each layer
-            each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
+    Example::
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+        # Already been converted into WordPiece token ids
+        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
-    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
-        n_layer=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = modeling.XLNetModel(config=config)
-    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-    ```
+        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """
     def __init__(self, config):
         super(XLNetForSequenceClassification, self).__init__(config)
@@ -1129,6 +1095,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 labels=None, head_mask=None):
         """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
         Args:
             input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
             token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
@@ -1148,12 +1116,38 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             target_mapping: float32 Tensor in shape [bsz, num_predict, len].
                 If target_mapping[k, i, j] = 1, the i-th predict in batch k is
                 on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
+                Only used during pre-training for partial prediction.
+                Set to None during fine-tuning.
             inp_q: float32 Tensor in shape [bsz, len].
                 1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
+                Only used during pre-training for two-stream attention.
+                Set to None during fine-tuning.
+            labels: TODO
+            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+
+        Returns:
+            A ``tuple(logits_or_loss, mems)``
+
+                ``logits_or_loss``: if ``labels`` is ``None``, ``logits_or_loss`` corresponds to token logits with shape \
+                [batch_size, sequence_length]. If it is not ``None``, it corresponds to the ``CrossEntropy`` loss \
+                with the targets.
+
+                ``new_mems``: list (num layers) of updated mem states at the entry of each layer \
+                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
+                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
@@ -1178,60 +1172,24 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
 
 class XLNetForQuestionAnswering(XLNetPreTrainedModel):
-    """ XLNet model for Question Answering (span extraction).
-    This module is composed of the XLNet model with a linear layer on top of
-    the sequence output that computes start_logits and end_logits
+    """
+    XLNet model for Question Answering (span extraction).
 
-    Params:
+    This module is composed of the XLNet model with a linear layer on top of
+    the sequence output that computes ``start_logits`` and ``end_logits``
+
+    Args:
         `config`: a XLNetConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    Inputs:
-        `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-            `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-        `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see XLNet paper for more details).
-        `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-            but with 1 for real tokens and 0 for padding.
-            Added for easy compatibility with the BERT model (which uses this negative masking).
-            You can only uses one among `input_mask` and `attention_mask`
-        `input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
-        `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-            into account for computing the loss.
-        `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::
 
-    Outputs:
-        if `start_positions` and `end_positions` are not `None`:
-            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
-        if `start_positions` or `end_positions` is `None`:
-            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
-            position tokens of shape [batch_size, sequence_length].
+        config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 
-    Example usage:
-    ```python
-    # Already been converted into WordPiece token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-    model = XLNetForQuestionAnswering(config)
-    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+        model = XLNetForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(XLNetForQuestionAnswering, self).__init__(config)
@@ -1249,6 +1207,53 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                 start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
                 head_mask=None):
+
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
+                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
+                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+                a `sentence B` token (see XLNet paper for more details).
+            `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
+                but with 1 for real tokens and 0 for padding.
+                Added for easy compatibility with the BERT model (which uses this negative masking).
+                You can only uses one among ``input_mask`` and ``attention_mask``
+            `input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
+                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+                input sequence length in the current batch. It's the mask that we typically use for attention when
+                a batch has varying length sentences.
+            `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
+                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
+                into account for computing the loss.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the \
+            ``CrossEntropy`` loss for the start and end token positions.
+
+            if ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of ``start_logits``, ``end_logits``
+            which are the logits respectively for the start and end position tokens of shape \
+            [batch_size, sequence_length].
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+            # or
+            start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
+        """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, inp_q, head_mask)
         hidden_states = transformer_outputs[0]

From 331db8cc02359bb01d1dde539d7d4dae789cf82e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 17:01:56 -0400
Subject: [PATCH 083/139] Added viewcode plugin for source code visualization
 within the static website.

---
 docs/source/_static/css/code-snippets.css |  6 +++---
 docs/source/_static/css/huggingface.css   | 12 ++++++++++++
 docs/source/conf.py                       |  4 ++--
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/docs/source/_static/css/code-snippets.css b/docs/source/_static/css/code-snippets.css
index 4d525e95d7..43acc6751c 100644
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -1,12 +1,12 @@
 
-.highlight .c1{
+.highlight .c1, .highlight .sd{
     color: #999
 }
 
-.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp {
+.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
     color: #FB8D68;
 }
 
-.highlight .kn, .highlight .nv, .highlight .s2 {
+.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
     color: #6670FF;
 }
\ No newline at end of file
diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index f50726b57d..1895f5a10b 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -104,6 +104,18 @@ a {
     background-color: #6670FF;
 }
 
+
+/* Source spans */
+.rst-content .viewcode-link, .rst-content .viewcode-back{
+    color: #6670FF;
+    font-size: 110%;
+    letter-spacing: 2px;
+    text-transform: uppercase;
+}
+
+
+
+
 /* FONTS */
 body{
     font-family: Calibre;
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 706329c19a..16866b5e5c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -42,8 +42,8 @@ extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.coverage',
     'sphinx.ext.napoleon',
-    'recommonmark'
-
+    'recommonmark',
+    'sphinx.ext.viewcode'
 ]
 
 # Add any paths that contain templates here, relative to this directory.

From c4bab2dc85377dd9278d047a66e0f6cddfcfd5f5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 18:03:01 -0400
Subject: [PATCH 084/139] Added footer with social links.

---
 docs/source/_static/css/huggingface.css | 12 +++++++
 docs/source/_static/js/custom.js        | 48 +++++++++++++++++++++----
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 1895f5a10b..362c0992fb 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -113,6 +113,18 @@ a {
     text-transform: uppercase;
 }
 
+.footer {
+    margin-top: 20px;
+}
+
+.footer__Social {
+    display: flex;
+    flex-direction: row;
+}
+
+.footer__CustomImage {
+    margin: 2px 5px 0 0;
+}
 
 
 
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 9ddbbb7c49..4adf2a4672 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,18 +1,54 @@
 function addIcon() {
     const huggingFaceLogo = "http://lysand.re/huggingface_logo.svg";
     const image = document.createElement("img");
-    image.setAttribute("src", huggingFaceLogo)
+    image.setAttribute("src", huggingFaceLogo);
 
-
-    const div = document.createElement("div")
+    const div = document.createElement("div");
     div.appendChild(image);
     div.style.textAlign = 'center';
     div.style.paddingTop = '30px';
-    div.style.backgroundColor = '#6670FF'
+    div.style.backgroundColor = '#6670FF';
 
     const scrollDiv = document.getElementsByClassName("wy-side-scroll")[0];
-    scrollDiv.prepend(div)
+    scrollDiv.prepend(div);
 }
 
-window.addEventListener("load", addIcon)
+function addCustomFooter() {
+    const customFooter = document.createElement("div");
+    const questionOrIssue = document.createElement("div");
+    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/pytorch_transformers'>Create an issue</a>";
+    customFooter.appendChild(questionOrIssue);
+    customFooter.classList.add("footer");
+
+    const social = document.createElement("div");
+    social.classList.add("footer__Social");
+
+    const imageDetails = [
+        { link: "https://huggingface.co", imageLink: "http://lysand.re/icons/website.svg" },
+        { link: "https://twitter.com/huggingface", imageLink: "http://lysand.re/icons/twitter.svg" },
+        { link: "https://github.com/huggingface", imageLink: "http://lysand.re/icons/github.svg" },
+        { link: "https://www.linkedin.com/company/huggingface/", imageLink: "http://lysand.re/icons/linkedin.svg" }
+    ];
+
+    imageDetails.forEach(imageLinks => {
+        const link = document.createElement("a");
+        const image = document.createElement("img");
+        image.src = imageLinks.imageLink;
+        link.href = imageLinks.link;
+        image.style.width = "30px";
+        image.classList.add("footer__CustomImage");
+        link.appendChild(image);
+        social.appendChild(link);
+    });
+
+    customFooter.appendChild(social);
+    document.getElementsByTagName("footer")[0].appendChild(customFooter);
+}
+
+function onLoad() {
+    addIcon();
+    addCustomFooter();
+}
+
+window.addEventListener("load", onLoad);
 

From 3f56ad5aff88f2eca8def97a599565afc16d7278 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 9 Jul 2019 18:50:59 -0400
Subject: [PATCH 085/139] Updated CircleCI's config.yml to use a large resource
 class.

---
 .circleci/config.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 996eab6815..51b0ffa04e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -4,6 +4,7 @@ jobs:
         working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:3.5
+        resource_class: large
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
@@ -14,6 +15,7 @@ jobs:
             - run: codecov
     build_py2:
         working_directory: ~/pytorch-transformers
+        resource_class: large
         docker:
             - image: circleci/python:2.7
         steps:

From 50b7e52a7fc5fe777a3d70d0e7971b30c700418b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 10 Jul 2019 15:33:34 +0200
Subject: [PATCH 086/139] WIP examples

---
 examples/run_glue.py                   | 216 +++++++++---------
 examples/run_squad.py                  | 289 ++++++++++++++++---------
 examples/utils.py                      |  61 ++++++
 pytorch_transformers/modeling_xlnet.py |   2 +-
 pytorch_transformers/optimization.py   |   2 +
 5 files changed, 361 insertions(+), 209 deletions(-)
 create mode 100644 examples/utils.py

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 547a4e4698..1e14a3e183 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -37,7 +37,7 @@ from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenc
                                   XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
                                   XLMTokenizer)
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.optimization import BertAdam
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
@@ -60,12 +60,12 @@ TOKENIZER_CLASSES = {
     'xlm': XLMTokenizer,
 }
 
-def train(args, train_dataset, model):
+def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    args.train_batch_size = args.per_gpu_train_batch_size * args.n_gpu
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
@@ -76,42 +76,36 @@ def train(args, train_dataset, model):
         num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
+    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
+                         t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
     if args.fp16:
         try:
-            from apex.optimizers import FP16_Optimizer, FusedAdam
+            from apex import amp
         except ImportError:
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
-
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", num_train_optimization_steps)
 
     global_step = 0
-    tr_loss = 0
-    model.train()
+    tr_loss, logging_loss = 0.0, 0.0
     optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
                       'attention_mask': batch[1],
@@ -125,23 +119,25 @@ def train(args, train_dataset, model):
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
-            loss.backward() if not args.fp16 else optimizer.backward(loss)
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    # modify learning rate with special warm up BERT uses
-                    # if args.fp16 is False, BertAdam is used that handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                    for param_group in optimizer.param_groups:
-                        param_group['lr'] = lr_this_step
                 optimizer.step()
                 optimizer.zero_grad()
                 global_step += 1
-                if args.local_rank in [-1, 0]:
-                    if not args.fp16:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', loss.item(), global_step)
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    if args.local_rank == -1:  # Only evaluate on single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
             if args.max_steps > 0 and global_step > args.max_steps:
                 break
         if args.max_steps > 0 and global_step > args.max_steps:
@@ -150,62 +146,71 @@ def train(args, train_dataset, model):
     return global_step, tr_loss / global_step
 
 
-def evalutate(args, eval_task, eval_output_dir, dataset, model):
-    """ Evaluate the model """
-    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(eval_output_dir)
+def evaluate(args, model, tokenizer):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
 
-    # Eval!
-    logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    model.eval()
-    eval_loss = 0
-    nb_eval_steps = 0
-    preds = None
-    out_label_ids = None
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        batch = tuple(t.to(args.device) for t in batch)
+        """ Evaluate the model """
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
 
-        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
-            outputs = model(**inputs)
-            tmp_eval_loss, logits = outputs[:2]
+        # Note that DistributedSampler samples randomly
+        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
-        eval_loss += tmp_eval_loss.mean().item()
-        nb_eval_steps += 1
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            out_label_ids = inputs['labels'].detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+        # Eval!
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        model.eval()
+        eval_loss = 0
+        nb_eval_steps = 0
+        preds = None
+        out_label_ids = None
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            batch = tuple(t.to(args.device) for t in batch)
 
-    eval_loss = eval_loss / nb_eval_steps
-    if args.output_mode == "classification":
-        preds = np.argmax(preds, axis=1)
-    elif args.output_mode == "regression":
-        preds = np.squeeze(preds)
-    result = compute_metrics(eval_task, preds, out_label_ids)
+            with torch.no_grad():
+                inputs = {'input_ids':      batch[0],
+                          'attention_mask': batch[1],
+                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                          'labels':         batch[3]}
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
 
-    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results *****")
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
+            eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+                out_label_ids = inputs['labels'].detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
 
-    return result
+        eval_loss = eval_loss / nb_eval_steps
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif args.output_mode == "regression":
+            preds = np.squeeze(preds)
+        result = compute_metrics(eval_task, preds, out_label_ids)
+        results.update(result)
+
+        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return results
 
 
-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+def load_and_cache_examples(args, task, tokenizer, evaluate=False, overwrite_cache=False):
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
@@ -214,7 +219,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
@@ -270,39 +275,44 @@ def main():
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size", default=32, type=int,
-                        help="Total batch size for training.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU for training.")
     parser.add_argument("--eval_batch_size", default=8, type=int,
                         help="Total batch size for eval.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--max_steps", default=-1, type=int,
                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
+
+    parser.add_argument('--logging_steps', type=int, default=100,
+                        help="Log every X updates steps.")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
                         help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
 
     parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale', type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
     args = parser.parse_args()
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
@@ -362,13 +372,10 @@ def main():
     if args.local_rank == 0:
         torch.distributed.barrier()
 
-    # Distributed, parrallel and fp16 model
-    if args.fp16:
-        model.half()
+    # Distributed and parrallel training
     model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
     elif args.n_gpu > 1:
@@ -377,7 +384,7 @@ def main():
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
@@ -402,17 +409,10 @@ def main():
         model.to(args.device)
 
     # Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Handle MNLI double evaluation
-        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+    if args.do_eval and args.local_rank in [-1, 0]:
+        results = evaluate(args, model, tokenizer)
 
-        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-
-            result = evalutate(args, eval_task, eval_output_dir, eval_dataset, model)
-
-        return result
+        return results
 
 
 if __name__ == "__main__":
diff --git a/examples/run_squad.py b/examples/run_squad.py
index d6d7279cb8..7f063109e3 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -33,36 +33,156 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForQuestionAnswering
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers import (BertForQuestionAnswering, XLNetForQuestionAnswering,
+                                  XLMForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
+                                  XLMTokenizer)
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
 logger = logging.getLogger(__name__)
 
+ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+
+MODEL_CLASSES = {
+    'bert': BertForQuestionAnswering,
+    'xlnet': XLNetForQuestionAnswering,
+    'xlm': XLMForQuestionAnswering,
+}
+
+TOKENIZER_CLASSES = {
+    'bert': BertTokenizer,
+    'xlnet': XLNetTokenizer,
+    'xlm': XLMTokenizer,
+}
+
+def train(args, train_dataset, model):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        num_train_optimization_steps = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
+                         t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Total batch size (distributed) = %d", args.train_batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.train()
+    optimizer.zero_grad()
+    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            ouputs = model(**inputs)
+            loss = ouputs[0]
+
+
+def evalutate(args, dataset, model):
+    """ Evaluate the model """
+
+
+
+def load_and_cache_examples(args, tokenizer, training=True):
+    """ Load data features from cache or dataset file. """
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = read_squad_examples(input_file=args.train_file if training else args.predict_file,
+                        is_training=training,
+                        version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=training)
+        if args.local_rank in [-1, 0]:
+            logger.info("Num orig examples = %d", len(examples))
+            logger.info("Num split examples = %d", len(features))
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+    if training:
+        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
+    else:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+
+    return dataset
+
 
 def main():
     parser = argparse.ArgumentParser()
 
     ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model checkpoints and predictions will be written.")
 
     ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+
     parser.add_argument("--max_seq_length", default=384, type=int,
                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                              "longer than this will be truncated, and sequences shorter than this will be padded.")
@@ -71,65 +191,53 @@ def main():
     parser.add_argument("--max_query_length", default=64, type=int,
                         help="The maximum number of tokens for the question. Questions longer than this will "
                              "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_predict", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+
+    parser.add_argument("--train_batch_size", default=32, type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--predict_batch_size", default=8, type=int,
+                        help="Total batch size for predictions.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
+                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
     parser.add_argument("--max_answer_length", default=30, type=int,
                         help="The maximum length of an answer that can be generated. This is needed because the start "
                              "and end predictions are not conditioned on one another.")
     parser.add_argument("--verbose_logging", action='store_true',
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
+
+    parser.add_argument("--no_cuda", action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
+    parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
+    parser.add_argument("--local_rank", type=int, default=-1,
                         help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
     print(args)
 
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
@@ -137,71 +245,52 @@ def main():
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()
 
+    # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
 
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    # Setup logging
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
+    # Setup seeds
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu > 0:
+    if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError(
-                "If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError(
-                "If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
+    # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()  # Make sure only 1st process in distributed training download model & vocab
+
+    args.model_type = args.model_name.lower().split('-')[0]
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
     if args.local_rank == 0:
         torch.distributed.barrier()
 
-    if args.fp16:
-        model.half()
-    model.to(device)
+    # Distributed and parrallel training
+    model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    # Training
     if args.do_train:
         if args.local_rank in [-1, 0]:
             tb_writer = SummaryWriter()
diff --git a/examples/utils.py b/examples/utils.py
new file mode 100644
index 0000000000..e4b7263efa
--- /dev/null
+++ b/examples/utils.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2019-present, the HuggingFace Inc. authors.
+# All rights reserved. This source code is licensed under the BSD-style
+# license found in the LICENSE file in the root directory of this source tree.
+import logging
+import os
+from tqdm import tqdm
+from pprint import pformat
+
+import torch
+
+from ignite.engine import Engine, Events
+from ignite.handlers import ModelCheckpoint
+from ignite.metrics import RunningAverage
+from ignite.contrib.handlers import ProgressBar
+from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger
+
+
+def average_distributed_scalar(scalar, args):
+    """ Average a scalar over nodes if we are in distributed training.
+        We use this for distributed evaluation.
+        Beware, such averages only works for metrics which are additive with regard
+        to the evaluation dataset, e.g. accuracy, log probabilities.
+        Doesn't work for ratio metrics like F1.
+    """
+    if args.local_rank == -1:
+        return scalar
+    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
+    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
+    return scalar_t.item()
+
+
+def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
+    """ Add to a PyTorch ignite training engine tensorboard logging,
+        progress bar with average loss, checkpoint saving and save training config.
+    """
+    # Add progress bar with average loss
+    RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss")
+    pbar = ProgressBar(persist=True)
+    pbar.attach(trainer, metric_names=[prefix + "loss"])
+    evaluator.add_event_handler(Events.COMPLETED, lambda _:
+                                pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
+
+    # Add tensorboard logging with training and evaluation metrics
+    tb_logger = TensorboardLogger(log_dir=None)
+    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]),
+                     event_name=Events.ITERATION_COMPLETED)
+    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
+                     event_name=Events.ITERATION_STARTED)
+    @evaluator.on(Events.COMPLETED)
+    def tb_log_metrics(engine):
+        for name in metrics.keys():
+            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)
+
+    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
+    checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
+    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})
+
+    # Save training configuration
+    torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
+
+    return checkpoint_handler, tb_logger
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index e0b3fb0661..1782cb2f84 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -393,7 +393,7 @@ class XLNetRelativeAttention(nn.Module):
         x = x[1:, ...]
         x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
         # x = x[:, 0:klen, :, :]
-        x = torch.index_select(x, 1, torch.arange(klen))
+        x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
 
         return x
 
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index d13dd45c6b..b2f2e43b1c 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -227,6 +227,8 @@ class BertAdam(Optimizer):
         lr = []
         for group in self.param_groups:
             for p in group['params']:
+                if p.grad is None:
+                    continue
                 state = self.state[p]
                 if len(state) == 0:
                     return [0]

From f773faa25871357cc25fb04d65cc0d4bcb1364da Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 10 Jul 2019 14:45:56 -0400
Subject: [PATCH 087/139] Fixed all links. Removed TPU. Changed CLI to
 Converting TF models. Many minor formatting adjustments. Added "TODO Lysandre
 filled" where necessary.

---
 docs/source/_static/css/huggingface.css       |   3 +-
 docs/source/bertology.md                      |   1 -
 docs/source/bertology.rst                     |  18 ++
 ...i.rst => converting_tensorflow_models.rst} |   6 +-
 docs/source/examples.rst                      |  49 ++---
 docs/source/index.rst                         | 173 +++++++++++++-----
 docs/source/model_doc/bert.rst                |  16 +-
 docs/source/model_doc/gpt.rst                 |   6 +-
 docs/source/model_doc/gpt2.rst                |   6 +-
 docs/source/model_doc/overview.rst            |  27 ++-
 docs/source/model_doc/transformerxl.rst       |   4 +-
 docs/source/model_doc/xlm.rst                 |  13 +-
 docs/source/model_doc/xlnet.rst               |  15 +-
 docs/source/notebooks.rst                     |   6 +-
 docs/source/tpu.rst                           |  13 --
 docs/source/usage.rst                         |   8 +-
 pytorch_transformers/modeling_bert.py         |  10 +-
 pytorch_transformers/tokenization_xlm.py      |  11 +-
 pytorch_transformers/tokenization_xlnet.py    |   3 +-
 19 files changed, 235 insertions(+), 153 deletions(-)
 delete mode 100644 docs/source/bertology.md
 create mode 100644 docs/source/bertology.rst
 rename docs/source/{cli.rst => converting_tensorflow_models.rst} (84%)
 delete mode 100644 docs/source/tpu.rst

diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 362c0992fb..15b5030972 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -6,6 +6,7 @@
 /* To keep the logo centered */
 .wy-side-scroll {
     width: auto;
+    font-size: 20px;
 }
 
 /* The div that holds the Hugging Face logo */
@@ -131,7 +132,7 @@ a {
 /* FONTS */
 body{
     font-family: Calibre;
-    font-size: 20px;
+    font-size: 16px;
 }
 
 h1 {
diff --git a/docs/source/bertology.md b/docs/source/bertology.md
deleted file mode 100644
index e408484e84..0000000000
--- a/docs/source/bertology.md
+++ /dev/null
@@ -1 +0,0 @@
-# Bertology
\ No newline at end of file
diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst
new file mode 100644
index 0000000000..3c8a911953
--- /dev/null
+++ b/docs/source/bertology.rst
@@ -0,0 +1,18 @@
+BERTology
+---------
+
+There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
+
+
+* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
+* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
+* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+
+
+* accessing all the hidden-states of BERT/GPT/GPT-2,
+* accessing all the attention weights for each head of BERT/GPT/GPT-2,
+* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/bertology.py>`_ while extract information and prune a model pre-trained on MRPC.
diff --git a/docs/source/cli.rst b/docs/source/converting_tensorflow_models.rst
similarity index 84%
rename from docs/source/cli.rst
rename to docs/source/converting_tensorflow_models.rst
index 217cd1a8be..afcacc00a0 100644
--- a/docs/source/cli.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,4 +1,4 @@
-CLI
+Converting Tensorflow Models
 ================================================
 
 A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the ``BertForPreTraining`` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the ``OpenAIGPTModel`` class  (for OpenAI GPT).
@@ -6,9 +6,9 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 BERT
 ^^^^
 
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `\ ``convert_tf_checkpoint_to_pytorch.py`` <./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py>`_ script.
 
-This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `\ ``run_bert_extract_features.py`` <./examples/run_bert_extract_features.py>`_\ , `\ ``run_bert_classifier.py`` <./examples/run_bert_classifier.py>`_ and `\ ``run_bert_squad.py`` <./examples/run_bert_squad.py>`_\ ).
+This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
 
 You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
 
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index e7e1958d78..aee4066c2c 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -6,22 +6,24 @@ Examples
 
    * - Sub-section
      - Description
-   * - `Training large models: introduction, tools and examples <#Training-large-models-introduction,-tools-and-examples>`_
+   * - `Training large models: introduction, tools and examples <#introduction>`_
      - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
-   * - `Fine-tuning with BERT: running the examples <#Fine-tuning-with-BERT-running-the-examples>`_
-     - Running the examples in `\ ``./examples`` <./examples/>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
-   * - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#openai-gpt-transformer-xl-and-gpt-2-running-the-examples>`_
-     - Running the examples in `\ ``./examples`` <./examples/>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
-   * - `Fine-tuning BERT-large on GPUs <#Fine-tuning-BERT-large-on-GPUs>`_
+   * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
+     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
+   * - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#fine-tuning>`_
+     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
+   * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
      - How to fine tune ``BERT large``
 
 
+.. _introduction:
+
 Training large models: introduction, tools and examples
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
 
-To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `\ ``run_bert_classifier.py`` <./examples/run_bert_classifier.py>`_ and `\ ``run_bert_squad.py`` <./examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this month.
+To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this year.
 
 Here is how to use these techniques in our scripts:
 
@@ -33,7 +35,7 @@ Here is how to use these techniques in our scripts:
 
 To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
 
-Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <(https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_\ ) for more details):
+Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_\ ) for more details):
 
 .. code-block:: bash
 
@@ -41,6 +43,8 @@ Note: To use *Distributed Training*\ , you will need to run one training script
 
 Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
 
+.. _fine-tuning-bert-examples:
+
 Fine-tuning with BERT: running the examples
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -333,10 +337,12 @@ LM Fine-tuning
 ~~~~~~~~~~~~~~
 
 The data should be a text file in the same format as `sample_text.txt <./samples/sample_text.txt>`_  (one sentence per line, docs separated by empty line).
-You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and splitted into ~500k sentences with spaCy.
+You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and split into ~500k sentences with spaCy.
 Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
 
-Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `\ ``README`` <./examples/lm_finetuning/README.md>`_ of the `\ ``examples/lm_finetuning/`` <./examples/lm_finetuning/>`_ folder.
+Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/README.md>`_ of the `examples/lm_finetuning/ <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/>`_ folder.
+
+.. _fine-tuning:
 
 OpenAI GPT, Transformer-XL and GPT-2: running the examples
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -402,6 +408,8 @@ Unconditional generation:
 
 The same option as in the original scripts are provided, please refere to the code of the example and the original repository of OpenAI.
 
+.. _fine-tuning-BERT-large:
+
 Fine-tuning BERT-large on GPUs
 ------------------------------
 
@@ -571,23 +579,4 @@ Here is an example on MNLI:
      global_step = 18408
      loss = 0.04755385363816904
 
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
-
-BERTology
----------
-
-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
-
-
-* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
-* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
-
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
-
-
-* accessing all the hidden-states of BERT/GPT/GPT-2,
-* accessing all the attention weights for each head of BERT/GPT/GPT-2,
-* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
-
-To help you understand and use these features, we have added a specific example script: `\ ``bertology.py`` <./examples/bertology.py>`_ while extract information and prune a model pre-trained on MRPC.
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index dd92507f15..ada3fc1656 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -11,8 +11,7 @@ Pytorch-Transformers
     usage
     examples
     notebooks
-    tpu
-    cli
+    converting_tensorflow_models
     migration
     bertology
     torchscript
@@ -44,7 +43,7 @@ This repository contains op-for-op PyTorch reimplementations, pre-trained models
 * `Google/CMU's Transformer-XL model <https://github.com/kimiyoung/transformer-xl>`_\ , and
 * `OpenAI's GPT-2 model <https://blog.openai.com/better-language-models/>`_.
 
-These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <#examples>`_ section below.
+These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <./examples.html>`_ section.
 
 Here are some information on these models:
 
@@ -54,12 +53,19 @@ This PyTorch implementation of BERT is provided with `Google's pre-trained model
 **OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`__ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
-**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`_ by Zihang Dai\ *, Zhilin Yang*\ , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`_ by Zihang Dai\*, Zhilin Yang\* , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`_ which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
-**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`_ by Alec Radford\ *, Jeffrey Wu*\ , Rewon Child, David Luan, Dario Amodei\ ** and Ilya Sutskever**.
+**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`_ by Alec Radford\*, Jeffrey Wu\* , Rewon Child, David Luan, Dario Amodei\*\* and Ilya Sutskever\*\*.
 This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
+**Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
+This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`_. TODO Lysandre filled
+
+**Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le.
+This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation <https://github.com/zihangdai/xlnet>`_. TODO Lysandre filled
+
+
 Content
 -------
 
@@ -68,111 +74,180 @@ Content
 
    * - Section
      - Description
-   * - `Installation <#installation>`_
+   * - `Installation <./installation.html>`_
      - How to install the package
-   * - `Overview <#overview>`_
-     - Overview of the package
-   * - `Usage <#usage>`_
+   * - `Philosphy <./philosophy.html>`_
+     - The philosophy behind this package
+   * - `Usage <./usage.html>`_
      - Quickstart examples
-   * - `Doc <#doc>`_
-     - Detailed documentation
-   * - `Examples <#examples>`_
+   * - `Examples <./examples.html>`_
      - Detailed examples on how to fine-tune Bert
-   * - `Notebooks <#notebooks>`_
+   * - `Notebooks <./notebooks.html>`_
      - Introduction on the provided Jupyter Notebooks
-   * - `TPU <#tpu>`_
+   * - `TPU <./tpu.html>`_
      - Notes on TPU support and pretraining scripts
-   * - `Command-line interface <#Command-line-interface>`_
+   * - `Command-line interface <./cli.html>`_
      - Convert a TensorFlow checkpoint in a PyTorch dump
+   * - `Migration <./migration.html>`_
+     - Migrating from ``pytorch_pretrained_BERT`` (v0.6) to ``pytorch_transformers`` (v1.0)
+   * - `Bertology <./bertology.html>`_
+     - TODO Lysandre didn't know how to fill
+   * - `TorchScript <./torchscript.html>`_
+     - Convert a model to TorchScript for use in other programming languages
+
+.. list-table::
+   :header-rows: 1
+
+   * - Section
+     - Description
+   * - `Overview <./model_doc/overview.html>`_
+     - Overview of the package
+   * - `BERT <./model_doc/bert.html>`_
+     - BERT Models, Tokenizers and optimizers
+   * - `OpenAI GPT <./model_doc/gpt.html>`_
+     - GPT Models, Tokenizers and optimizers
+   * - `TransformerXL <./model_doc/transformerxl.html>`_
+     - TransformerXL Models, Tokenizers and optimizers
+   * - `OpenAI GPT2 <./model_doc/gpt2.html>`_
+     - GPT2 Models, Tokenizers and optimizers
+   * - `XLM <./model_doc/xlm.html>`_
+     - XLM Models, Tokenizers and optimizers
+   * - `XLNet <./model_doc/xlnet.html>`_
+     - XLNet Models, Tokenizers and optimizers
+
+TODO Lysandre filled: might need an introduction for both parts. Is it even necessary, since there is a summary? Up to you Thom.
 
 Overview
 --------
 
-This package comprises the following classes that can be imported in Python and are detailed in the `Doc <#doc>`_ section of this readme:
+This package comprises the following classes that can be imported in Python and are detailed in the `documentation <./model_doc/overview.html>`_ section of this package:
 
 
 *
-  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py>`_ file):
+  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`_ file):
 
 
-  * `BertModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L639>`_ - raw BERT Transformer model (\ **fully pre-trained**\ ),
-  * `BertForMaskedLM <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L793>`_ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
-  * `BertForNextSentencePrediction <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L854>`_ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
-  * `BertForPreTraining <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L722>`_ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
-  * `BertForSequenceClassification <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L916>`_ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
-  * `BertForMultipleChoice <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L982>`_ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
-  * `BertForTokenClassification <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1051>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
-  * `BertForQuestionAnswering <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1124>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
+  * `BertModel <./model_doc/bert.html#pytorch_transformers.BertModel>`_ - raw BERT Transformer model (\ **fully pre-trained**\ ),
+  * `BertForMaskedLM <./model_doc/bert.html#pytorch_transformers.BertForMaskedLM>`_ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
+  * `BertForNextSentencePrediction <./model_doc/bert.html#pytorch_transformers.BertForNextSentencePrediction>`_ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
+  * `BertForPreTraining <./model_doc/bert.html#pytorch_transformers.BertForPreTraining>`_ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
+  * `BertForSequenceClassification <./model_doc/bert.html#pytorch_transformers.BertForSequenceClassification>`_ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `BertForMultipleChoice <./model_doc/bert.html#pytorch_transformers.BertForMultipleChoice>`_ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `BertForTokenClassification <./model_doc/bert.html#pytorch_transformers.BertForTokenClassification>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
+  * `BertForQuestionAnswering <./model_doc/bert.html#pytorch_transformers.BertForQuestionAnswering>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
 
 *
-  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py>`_ file):
+  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`_ file):
 
 
-  * `OpenAIGPTModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L536>`_ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
-  * `OpenAIGPTLMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L643>`_ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `OpenAIGPTDoubleHeadsModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py#L722>`_ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `OpenAIGPTModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTModel>`_ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
+  * `OpenAIGPTLMHeadModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTLMHeadModel>`_ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `OpenAIGPTDoubleHeadsModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTDoubleHeadsModel>`_ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
 
 *
-  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py>`_ file):
+  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`_ file):
 
 
-  * `TransfoXLModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py#L983>`_ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
-  * `TransfoXLLMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py#L1260>`_ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLModel>`_ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLLMHeadModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLLMHeadModel>`_ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
 
 *
-  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py>`_ file):
+  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <./_modules/pytorch_transformers/modeling_gpt2.html>`_ file):
 
 
-  * `GPT2Model <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L479>`_ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
-  * `GPT2LMHeadModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L559>`_ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `GPT2DoubleHeadsModel <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_gpt2.py#L624>`_ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `GPT2Model <./model_doc/gpt2.html#pytorch_transformers.GPT2Model>`_ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
+  * `GPT2LMHeadModel <./model_doc/gpt2.html#pytorch_transformers.GPT2LMHeadModel>`_ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `GPT2DoubleHeadsModel <./model_doc/gpt2.html#pytorch_transformers.GPT2DoubleHeadsModel>`_ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
 
 *
-  Tokenizers for **BERT** (using word-piece) (in the `tokenization.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py>`_ file):
+  Four **XLM** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`_ file):
 
 
+  * `XLMModel <./model_doc/xlm.html#pytorch_transformers.XLMModel>`_ - raw XLM Transformer model (\ **fully pre-trained**\ ),
+  * `XLMWithLMHeadModel <./model_doc/xlm.html#pytorch_transformers.XLMWithLMHeadModel>`_ - XLM Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLMForSequenceClassification <./model_doc/xlm.html#pytorch_transformers.XLMForSequenceClassification>`_ - XLM Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLMForQuestionAnswering <./model_doc/xlm.html#pytorch_transformers.XLMForQuestionAnswering>`_ - XLM Transformer with a token classification head on top (XLM Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+
+*
+  Four **XLNet** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`_ file):
+
+
+  * `XLNetModel <./model_doc/xlnet.html#pytorch_transformers.XLNetModel>`_ - raw XLNet Transformer model (\ **fully pre-trained**\ ),
+  * `XLNetLMHeadModel <./model_doc/xlnet.html#pytorch_transformers.XLNetLMHeadModel>`_ - XLNet Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLNetForSequenceClassification <./model_doc/xlnet.html#pytorch_transformers.XLNetForSequenceClassification>`_ - XLNet Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLNetForQuestionAnswering <./model_doc/xlnet.html#pytorch_transformers.XLNetForQuestionAnswering>`_ - XLNet Transformer with a token classification head on top (XLNet Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+
+
+TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers because I don't know the current philosophy behind them.
+
+*
+  Tokenizers for **BERT** (using word-piece) (in the `tokenization_bert.py <./_modules/pytorch_transformers/tokenization_bert.html>`_ file):
+
   * ``BasicTokenizer`` - basic tokenization (punctuation splitting, lower casing, etc.),
   * ``WordpieceTokenizer`` - WordPiece tokenization,
   * ``BertTokenizer`` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-*
-  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_openai.py>`_ file):
 
+*
+  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <./_modules/pytorch_transformers/tokenization_openai.html>`_ file):
 
   * ``OpenAIGPTTokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
 
-*
-  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_transfo_xl.py>`_ file):
-
-
-  * ``OpenAIGPTTokenizer`` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
 *
-  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization_gpt2.py>`_ file):
-
+  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <./_modules/pytorch_transformers/tokenization_gpt2.html>`_ file):
 
   * ``GPT2Tokenizer`` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
+
 *
-  Optimizer for **BERT** (in the `optimization.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/pytorch_pretrained_bert/optimization.py>`_ file):
+  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <./_modules/pytorch_transformers/tokenization_transfo_xl.html>`_ file):
+
+  * ``OpenAIGPTTokenizer`` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
+
+
+*
+  Tokenizer for **XLNet** (SentencePiece based tokenizer) (in the `tokenization_xlnet.py <./_modules/pytorch_transformers/tokenization_xlnet.html>`_ file):
+
+  * ``XLNetTokenizer`` - perform SentencePiece tokenization.
+
+
+*
+  Tokenizer for **XLM** (using Byte-Pair-Encoding) (in the `tokenization_xlm.py <./_modules/pytorch_transformers/tokenization_xlm.html>`_ file):
+
+  * ``GPT2Tokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
+
+
+*
+  Optimizer for **BERT** (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`_ file):
 
 
   * ``BertAdam`` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
+
 *
-  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/optimization_openai.py>`_ file):
+  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <./_modules/pytorch_transformers/optimization_openai.html>`_ file):
 
 
   * ``OpenAIAdam`` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
+
 *
-  Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective `modeling.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py>`_\ , `modeling_openai.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_openai.py>`_\ , `modeling_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling_transfo_xl.py>`_ files):
+  Configuration classes for BERT, OpenAI GPT, Transformer-XL, XLM and XLNet (in the respective \
+  `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`_\ , \
+  `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`_\ , \
+  `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`_, \
+  `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`_, \
+  `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`_ \
+  files):
 
 
   * ``BertConfig`` - Configuration class to store the configuration of a ``BertModel`` with utilities to read and write from JSON configuration files.
   * ``OpenAIGPTConfig`` - Configuration class to store the configuration of a ``OpenAIGPTModel`` with utilities to read and write from JSON configuration files.
   * ``GPT2Config`` - Configuration class to store the configuration of a ``GPT2Model`` with utilities to read and write from JSON configuration files.
   * ``TransfoXLConfig`` - Configuration class to store the configuration of a ``TransfoXLModel`` with utilities to read and write from JSON configuration files.
+  * ``XLMConfig`` - Configuration class to store the configuration of a ``XLMModel`` with utilities to read and write from JSON configuration files.
+  * ``XLNetConfig`` - Configuration class to store the configuration of a ``XLNetModel`` with utilities to read and write from JSON configuration files.
 
 The repository further comprises:
 
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 554c6e9b7c..3a2e12a6dd 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -21,56 +21,56 @@ BERT
 .. autoclass:: pytorch_transformers.BertAdam
     :members:
 
-1. ``BertModel``
+``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertModel
     :members:
 
 
-2. ``BertForPreTraining``
+``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForPreTraining
     :members:
 
 
-3. ``BertForMaskedLM``
+``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForMaskedLM
     :members:
 
 
-4. ``BertForNextSentencePrediction``
+``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForNextSentencePrediction
     :members:
 
 
-5. ``BertForSequenceClassification``
+``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForSequenceClassification
     :members:
 
 
-6. ``BertForMultipleChoice``
+``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForMultipleChoice
     :members:
 
 
-7. ``BertForTokenClassification``
+``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForTokenClassification
     :members:
 
 
-8. ``BertForQuestionAnswering``
+``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.BertForQuestionAnswering
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index b5e518759a..815cbe5787 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -22,21 +22,21 @@ OpenAI GPT
     :members:
 
 
-9. ``OpenAIGPTModel``
+``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.OpenAIGPTModel
     :members:
 
 
-10. ``OpenAIGPTLMHeadModel``
+``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
     :members:
 
 
-11. ``OpenAIGPTDoubleHeadsModel``
+``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index fe2cd46c37..a49d1b4258 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -15,21 +15,21 @@ OpenAI GPT2
     :members:
 
 
-14. ``GPT2Model``
+``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.GPT2Model
     :members:
 
 
-15. ``GPT2LMHeadModel``
+``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.GPT2LMHeadModel
     :members:
 
 
-16. ``GPT2DoubleHeadsModel``
+``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index f76c010e85..d70fa3beb9 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -14,13 +14,8 @@ Here is a detailed documentation of the classes in the package and how to use th
    * - `Serialization best-practices <#serialization-best-practices>`__
      - How to save and reload a fine-tuned model
    * - `Configurations <#configurations>`__
-     - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Models <#models>`__
-     - API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Tokenizers <#tokenizers>`__
-     - API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL
-   * - `Optimizers <#optimizers>`__
-     - API of the optimizers
+
+TODO Lysandre filled: Removed Models/Tokenizers/Optimizers as no single link can be made.
 
 
 Configurations
@@ -245,7 +240,7 @@ An overview of the implemented schedules:
 
 
 * ``ConstantLR``\ : always returns learning rate 1.
-* ``WarmupConstantSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+* ``WarmupConstantSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
     Keeps learning rate equal to 1. after warmup.
 
   .. image:: /imgs/warmup_constant_schedule.png
@@ -253,7 +248,7 @@ An overview of the implemented schedules:
      :alt:
 
 
-* ``WarmupLinearSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+* ``WarmupLinearSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
     Linearly decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps.
 
   .. image:: /imgs/warmup_linear_schedule.png
@@ -261,9 +256,9 @@ An overview of the implemented schedules:
      :alt:
 
 
-* ``WarmupCosineSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
-   Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve.
-   If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+* ``WarmupCosineSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
+  Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve. \
+  If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
 
   .. image:: /imgs/warmup_cosine_schedule.png
      :target: /imgs/warmup_cosine_schedule.png
@@ -271,7 +266,7 @@ An overview of the implemented schedules:
 
 
 * ``WarmupCosineWithHardRestartsSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
-    If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
+  If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
 
   .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
      :target: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -279,9 +274,9 @@ An overview of the implemented schedules:
 
 
 * ``WarmupCosineWithWarmupRestartsSchedule`` : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
-    Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
+  Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
+  followed by a learning rate decreasing from 1. to 0. following a cosine curve.
+  Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
 
   .. image:: /imgs/warmup_cosine_warm_restarts_schedule.png
      :target: /imgs/warmup_cosine_warm_restarts_schedule.png
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index 20cc7a224c..88cca450ee 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -16,14 +16,14 @@ Transformer XL
     :members:
 
 
-12. ``TransfoXLModel``
+``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.TransfoXLModel
     :members:
 
 
-13. ``TransfoXLLMHeadModel``
+``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 3d7a9e5e44..217952ea5e 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -7,29 +7,34 @@ XLM
 .. autoclass:: pytorch_transformers.XLMConfig
     :members:
 
+``XLMTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-17. ``XLMModel``
+.. autoclass:: pytorch_transformers.XLMTokenizer
+    :members:
+
+``XLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMModel
     :members:
 
 
-18. ``XLMWithLMHeadModel``
+``XLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMWithLMHeadModel
     :members:
 
 
-19. ``XLMForSequenceClassification``
+``XLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMForSequenceClassification
     :members:
 
 
-20. ``XLMForQuestionAnswering``
+``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLMForQuestionAnswering
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index b150e771bd..e388934c56 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -8,28 +8,35 @@ XLNet
     :members:
 
 
-21. ``XLNetModel``
+``XLNetTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetTokenizer
+    :members:
+
+
+``XLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLNetModel
     :members:
 
 
-22. ``XLNetLMHeadModel``
+``XLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLNetLMHeadModel
     :members:
 
 
-23. ``XLNetForSequenceClassification``
+``XLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLNetForSequenceClassification
     :members:
 
 
-24. ``XLNetForQuestionAnswering``
+``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst
index f608bf64c5..35d54370ba 100644
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -5,12 +5,12 @@ We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-pret
 
 
 *
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <./notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
 *
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 
 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/mnotebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
 
 Please follow the instructions given in the notebooks to run and modify them.
diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst
deleted file mode 100644
index 31f72ca891..0000000000
--- a/docs/source/tpu.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-TPU
-================================================
-
-TPU support and pretraining scripts
-------------------------------------------------
-
-TPU are not supported by the current stable release of PyTorch (0.4.1). However, the next version of PyTorch (v1.0) should support training on TPU and is expected to be released soon (see the recent `official announcement <https://cloud.google.com/blog/products/ai-machine-learning/introducing-pytorch-across-google-cloud>`_\ ).
-
-We will add TPU support when this next release is published.
-
-The original TensorFlow code further comprises two scripts for pre-training BERT: `create_pretraining_data.py <https://github.com/google-research/bert/blob/master/create_pretraining_data.py>`_ and `run_pretraining.py <https://github.com/google-research/bert/blob/master/run_pretraining.py>`_.
-
-Since, pre-training BERT is a particularly expensive operation that basically requires one or several TPUs to be completed in a reasonable amout of time (see details `here <https://github.com/google-research/bert#pre-training-with-bert>`_\ ) we have decided to wait for the inclusion of TPU support in PyTorch to convert these pre-training scripts.
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 1abfa3c1aa..9956f3ac84 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -4,7 +4,7 @@ Usage
 BERT
 ^^^^
 
-Here is a quick-start example using ``BertTokenizer``\ , ``BertModel`` and ``BertForMaskedLM`` class with Google AI's pre-trained ``Bert base uncased`` model. See the `doc section <#doc>`_ below for all the details on these classes.
+Here is a quick-start example using ``BertTokenizer``\ , ``BertModel`` and ``BertForMaskedLM`` class with Google AI's pre-trained ``Bert base uncased`` model. See the `doc section <./model_doc/overview.html>`_ below for all the details on these classes.
 
 First let's prepare a tokenized input with ``BertTokenizer``
 
@@ -82,7 +82,7 @@ And how to use ``BertForMaskedLM``
 OpenAI GPT
 ^^^^^^^^^^
 
-Here is a quick-start example using ``OpenAIGPTTokenizer``\ , ``OpenAIGPTModel`` and ``OpenAIGPTLMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <#doc>`_ below for all the details on these classes.
+Here is a quick-start example using ``OpenAIGPTTokenizer``\ , ``OpenAIGPTModel`` and ``OpenAIGPTLMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
 
 First let's prepare a tokenized input with ``OpenAIGPTTokenizer``
 
@@ -170,7 +170,7 @@ And how to use ``OpenAIGPTDoubleHeadsModel``
 Transformer-XL
 ^^^^^^^^^^^^^^
 
-Here is a quick-start example using ``TransfoXLTokenizer``\ , ``TransfoXLModel`` and ``TransfoXLModelLMHeadModel`` class with the Transformer-XL model pre-trained on WikiText-103. See the `doc section <#doc>`_ below for all the details on these classes.
+Here is a quick-start example using ``TransfoXLTokenizer``\ , ``TransfoXLModel`` and ``TransfoXLModelLMHeadModel`` class with the Transformer-XL model pre-trained on WikiText-103. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
 
 First let's prepare a tokenized input with ``TransfoXLTokenizer``
 
@@ -246,7 +246,7 @@ And how to use ``TransfoXLLMHeadModel``
 OpenAI GPT-2
 ^^^^^^^^^^^^
 
-Here is a quick-start example using ``GPT2Tokenizer``\ , ``GPT2Model`` and ``GPT2LMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <#doc>`_ below for all the details on these classes.
+Here is a quick-start example using ``GPT2Tokenizer``\ , ``GPT2Model`` and ``GPT2LMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
 
 First let's prepare a tokenized input with ``GPT2Tokenizer``
 
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 848296ad9f..cad5409350 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -1274,20 +1274,20 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
         Parameters:
-            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
                 a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
                 selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
                 input sequence length in the current batch. It's the mask that we typically use for attention when
                 a batch has varying length sentences.
-            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
                 Positions are clamped to the length of the sequence and position outside of the sequence are not taken
                 into account for computing the loss.
-            `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+            `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
                 Positions are clamped to the length of the sequence and position outside of the sequence are not taken
                 into account for computing the loss.
             `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 8851455829..74bc56f350 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -77,10 +77,15 @@ def text_standardize(text):
 class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
+
         - lower case all inputs
-        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
+
+        - uses `SpaCy tokenizer <https://spacy.io/api/tokenizer/>`_ and \
+        `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ for pre-BPE tokenization if they are installed, \
+        fallback to BERT's BasicTokenizer if not.
+
+        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
+        (ex: "__classify__") to a vocabulary.
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 942b532ec6..48ec3d88a1 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -52,7 +52,8 @@ SEG_ID_PAD = 4
 class XLNetTokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
-            - requires SentencePiece: https://github.com/google/sentencepiece
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

From 5288913bdd397bfe6e954ec7602d29f76dabc2a2 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 10 Jul 2019 15:16:40 -0400
Subject: [PATCH 088/139] All TODOs to be checked by Thom have been added.

---
 pytorch_transformers/modeling_gpt2.py   | 27 ++++++++++---
 pytorch_transformers/modeling_openai.py | 25 ++++++++----
 pytorch_transformers/modeling_xlnet.py  | 52 ++++++++++++++++---------
 3 files changed, 72 insertions(+), 32 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index ec2abf72b9..9ec5107b2a 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -483,7 +483,14 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens=None):
-        """Update input embeddings with new embedding matrix if needed."""
+        """
+        Update input embeddings with new embedding matrix if needed.
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+
+        TODO Lysandre filled args
+        """
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
@@ -625,8 +632,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-
-
     Example::
 
         config = modeling_gpt2.GPT2Config()
@@ -642,7 +647,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-        TODO Shouldn't we put args + returns ?
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
@@ -737,7 +748,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-        TODO Shouldn't we put args + returns ?
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 37736efed7..d5e8185c12 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -496,12 +496,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         """
         Update input embeddings with new embedding matrice if needed
 
-        TODO
-
         Args:
-            num_special_tokens:
+            num_special_tokens: Special tokens to be added to the embedding matrix
 
-        Returns:
+        TODO Lysandre filled Args
 
         """
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
@@ -665,7 +663,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-        TODO
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled Args
 
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
@@ -775,9 +779,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
-            TODO
+        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled Args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 5fee4e8524..7cef1b101b 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -623,7 +623,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
 class XLNetModel(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
 
-    TODO: this was copied from the XLNetLMHeadModel, check that it's ok.
+    TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok.
 
     Args:
         `config`: a XLNetConfig class instance with the configuration to build a new model
@@ -631,7 +631,15 @@ class XLNetModel(XLNetPreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    TODO: Add usage
+
+    Example::
+
+        config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
+            n_layer=12, num_attention_heads=12, intermediate_size=3072)
+
+        model = modeling.XLNetModel(config=config)
+
+    TODO Lysandre filled: Added example usage
     """
     def __init__(self, config):
         super(XLNetModel, self).__init__(config)
@@ -663,8 +671,8 @@ class XLNetModel(XLNetPreTrainedModel):
         Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
 
         Args:
-            qlen: TODO
-            mlen: TODO
+            qlen: TODO Lysandre didn't fill
+            mlen: TODO Lysandre didn't fill
 
         ::
 
@@ -783,19 +791,25 @@ class XLNetModel(XLNetPreTrainedModel):
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
+            head_mask: TODO Lysandre didn't fill
 
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
-            summary_type: str, "last", "first", "mean", or "attn". The method
-                to pool the input to get a vector representation.
 
-        TODO: Add usage
+        Returns:
+            TODO Lysandre didn't fill: Missing returns!
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
+
+        TODO Lysandre filled: Filled with the LMHead example, is probably different since it has a different output
+
         """
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
@@ -951,14 +965,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-
-
     Example::
 
         config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
             n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
-        model = modeling.XLNetModel(config=config)
+        model = modeling.XLNetLMHeadModel(config=config)
+
+    TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example
     """
     def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
@@ -1122,7 +1136,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pre-training for two-stream attention.
                 Set to None during fine-tuning.
-            labels: TODO
+            labels: TODO Lysandre didn't fill
             head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
 

From c82b74b996b309bb1334ebcc5ffca1fa96cb00e1 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 10 Jul 2019 15:30:19 -0400
Subject: [PATCH 089/139] Fixed Sphinx errors and warnings

---
 docs/source/index.rst              | 194 ++++++++++++++---------------
 docs/source/model_doc/overview.rst |   2 +
 docs/source/notebooks.rst          |   2 +-
 3 files changed, 100 insertions(+), 98 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index ada3fc1656..ded234354d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,32 +38,32 @@ Pytorch-Transformers
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 
 
-* `Google's BERT model <https://github.com/google-research/bert>`_\ ,
-* `OpenAI's GPT model <https://github.com/openai/finetune-transformer-lm>`_\ ,
-* `Google/CMU's Transformer-XL model <https://github.com/kimiyoung/transformer-xl>`_\ , and
-* `OpenAI's GPT-2 model <https://blog.openai.com/better-language-models/>`_.
+* `Google's BERT model <https://github.com/google-research/bert>`__\ ,
+* `OpenAI's GPT model <https://github.com/openai/finetune-transformer-lm>`__\ ,
+* `Google/CMU's Transformer-XL model <https://github.com/kimiyoung/transformer-xl>`__\ , and
+* `OpenAI's GPT-2 model <https://blog.openai.com/better-language-models/>`__.
 
-These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <./examples.html>`_ section.
+These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <./examples.html>`__ section.
 
 Here are some information on these models:
 
-**BERT** was released together with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-This PyTorch implementation of BERT is provided with `Google's pre-trained models <https://github.com/google-research/bert>`_\ , examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+**BERT** was released together with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+This PyTorch implementation of BERT is provided with `Google's pre-trained models <https://github.com/google-research/bert>`__\ , examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
-**OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`__ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+**OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`__ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`__ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
-**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`_ by Zihang Dai\*, Zhilin Yang\* , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`_ which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`__ by Zihang Dai\*, Zhilin Yang\* , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`__ which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
-**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`_ by Alec Radford\*, Jeffrey Wu\* , Rewon Child, David Luan, Dario Amodei\*\* and Ilya Sutskever\*\*.
-This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`_ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford\*, Jeffrey Wu\* , Rewon Child, David Luan, Dario Amodei\*\* and Ilya Sutskever\*\*.
+This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`__ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
-**Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
-This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`_. TODO Lysandre filled
+**Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
+This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`__. TODO Lysandre filled
 
-**Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le.
-This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation <https://github.com/zihangdai/xlnet>`_. TODO Lysandre filled
+**Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le.
+This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation <https://github.com/zihangdai/xlnet>`__. TODO Lysandre filled
 
 
 Content
@@ -74,25 +74,25 @@ Content
 
    * - Section
      - Description
-   * - `Installation <./installation.html>`_
+   * - `Installation <./installation.html>`__
      - How to install the package
-   * - `Philosphy <./philosophy.html>`_
+   * - `Philosphy <./philosophy.html>`__
      - The philosophy behind this package
-   * - `Usage <./usage.html>`_
+   * - `Usage <./usage.html>`__
      - Quickstart examples
-   * - `Examples <./examples.html>`_
+   * - `Examples <./examples.html>`__
      - Detailed examples on how to fine-tune Bert
-   * - `Notebooks <./notebooks.html>`_
+   * - `Notebooks <./notebooks.html>`__
      - Introduction on the provided Jupyter Notebooks
-   * - `TPU <./tpu.html>`_
+   * - `TPU <./tpu.html>`__
      - Notes on TPU support and pretraining scripts
-   * - `Command-line interface <./cli.html>`_
+   * - `Command-line interface <./cli.html>`__
      - Convert a TensorFlow checkpoint in a PyTorch dump
-   * - `Migration <./migration.html>`_
+   * - `Migration <./migration.html>`__
      - Migrating from ``pytorch_pretrained_BERT`` (v0.6) to ``pytorch_transformers`` (v1.0)
-   * - `Bertology <./bertology.html>`_
+   * - `Bertology <./bertology.html>`__
      - TODO Lysandre didn't know how to fill
-   * - `TorchScript <./torchscript.html>`_
+   * - `TorchScript <./torchscript.html>`__
      - Convert a model to TorchScript for use in other programming languages
 
 .. list-table::
@@ -100,19 +100,19 @@ Content
 
    * - Section
      - Description
-   * - `Overview <./model_doc/overview.html>`_
+   * - `Overview <./model_doc/overview.html>`__
      - Overview of the package
-   * - `BERT <./model_doc/bert.html>`_
+   * - `BERT <./model_doc/bert.html>`__
      - BERT Models, Tokenizers and optimizers
-   * - `OpenAI GPT <./model_doc/gpt.html>`_
+   * - `OpenAI GPT <./model_doc/gpt.html>`__
      - GPT Models, Tokenizers and optimizers
-   * - `TransformerXL <./model_doc/transformerxl.html>`_
+   * - `TransformerXL <./model_doc/transformerxl.html>`__
      - TransformerXL Models, Tokenizers and optimizers
-   * - `OpenAI GPT2 <./model_doc/gpt2.html>`_
+   * - `OpenAI GPT2 <./model_doc/gpt2.html>`__
      - GPT2 Models, Tokenizers and optimizers
-   * - `XLM <./model_doc/xlm.html>`_
+   * - `XLM <./model_doc/xlm.html>`__
      - XLM Models, Tokenizers and optimizers
-   * - `XLNet <./model_doc/xlnet.html>`_
+   * - `XLNet <./model_doc/xlnet.html>`__
      - XLNet Models, Tokenizers and optimizers
 
 TODO Lysandre filled: might need an introduction for both parts. Is it even necessary, since there is a summary? Up to you Thom.
@@ -120,68 +120,68 @@ TODO Lysandre filled: might need an introduction for both parts. Is it even nece
 Overview
 --------
 
-This package comprises the following classes that can be imported in Python and are detailed in the `documentation <./model_doc/overview.html>`_ section of this package:
+This package comprises the following classes that can be imported in Python and are detailed in the `documentation <./model_doc/overview.html>`__ section of this package:
 
 
 *
-  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`_ file):
+  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`__ file):
 
 
-  * `BertModel <./model_doc/bert.html#pytorch_transformers.BertModel>`_ - raw BERT Transformer model (\ **fully pre-trained**\ ),
-  * `BertForMaskedLM <./model_doc/bert.html#pytorch_transformers.BertForMaskedLM>`_ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
-  * `BertForNextSentencePrediction <./model_doc/bert.html#pytorch_transformers.BertForNextSentencePrediction>`_ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
-  * `BertForPreTraining <./model_doc/bert.html#pytorch_transformers.BertForPreTraining>`_ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
-  * `BertForSequenceClassification <./model_doc/bert.html#pytorch_transformers.BertForSequenceClassification>`_ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
-  * `BertForMultipleChoice <./model_doc/bert.html#pytorch_transformers.BertForMultipleChoice>`_ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
-  * `BertForTokenClassification <./model_doc/bert.html#pytorch_transformers.BertForTokenClassification>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
-  * `BertForQuestionAnswering <./model_doc/bert.html#pytorch_transformers.BertForQuestionAnswering>`_ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
+  * `BertModel <./model_doc/bert.html#pytorch_transformers.BertModel>`__ - raw BERT Transformer model (\ **fully pre-trained**\ ),
+  * `BertForMaskedLM <./model_doc/bert.html#pytorch_transformers.BertForMaskedLM>`__ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
+  * `BertForNextSentencePrediction <./model_doc/bert.html#pytorch_transformers.BertForNextSentencePrediction>`__ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
+  * `BertForPreTraining <./model_doc/bert.html#pytorch_transformers.BertForPreTraining>`__ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
+  * `BertForSequenceClassification <./model_doc/bert.html#pytorch_transformers.BertForSequenceClassification>`__ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `BertForMultipleChoice <./model_doc/bert.html#pytorch_transformers.BertForMultipleChoice>`__ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `BertForTokenClassification <./model_doc/bert.html#pytorch_transformers.BertForTokenClassification>`__ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
+  * `BertForQuestionAnswering <./model_doc/bert.html#pytorch_transformers.BertForQuestionAnswering>`__ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
 
 *
-  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`_ file):
+  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`__ file):
 
 
-  * `OpenAIGPTModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTModel>`_ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
-  * `OpenAIGPTLMHeadModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTLMHeadModel>`_ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `OpenAIGPTDoubleHeadsModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTDoubleHeadsModel>`_ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `OpenAIGPTModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTModel>`__ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
+  * `OpenAIGPTLMHeadModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTLMHeadModel>`__ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `OpenAIGPTDoubleHeadsModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTDoubleHeadsModel>`__ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
 
 *
-  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`_ file):
+  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`__ file):
 
 
-  * `TransfoXLModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLModel>`_ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
-  * `TransfoXLLMHeadModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLLMHeadModel>`_ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLModel>`__ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLLMHeadModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLLMHeadModel>`__ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
 
 *
-  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <./_modules/pytorch_transformers/modeling_gpt2.html>`_ file):
+  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <./_modules/pytorch_transformers/modeling_gpt2.html>`__ file):
 
 
-  * `GPT2Model <./model_doc/gpt2.html#pytorch_transformers.GPT2Model>`_ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
-  * `GPT2LMHeadModel <./model_doc/gpt2.html#pytorch_transformers.GPT2LMHeadModel>`_ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `GPT2DoubleHeadsModel <./model_doc/gpt2.html#pytorch_transformers.GPT2DoubleHeadsModel>`_ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `GPT2Model <./model_doc/gpt2.html#pytorch_transformers.GPT2Model>`__ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
+  * `GPT2LMHeadModel <./model_doc/gpt2.html#pytorch_transformers.GPT2LMHeadModel>`__ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `GPT2DoubleHeadsModel <./model_doc/gpt2.html#pytorch_transformers.GPT2DoubleHeadsModel>`__ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
 
 *
-  Four **XLM** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`_ file):
+  Four **XLM** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`__ file):
 
 
-  * `XLMModel <./model_doc/xlm.html#pytorch_transformers.XLMModel>`_ - raw XLM Transformer model (\ **fully pre-trained**\ ),
-  * `XLMWithLMHeadModel <./model_doc/xlm.html#pytorch_transformers.XLMWithLMHeadModel>`_ - XLM Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `XLMForSequenceClassification <./model_doc/xlm.html#pytorch_transformers.XLMForSequenceClassification>`_ - XLM Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
-  * `XLMForQuestionAnswering <./model_doc/xlm.html#pytorch_transformers.XLMForQuestionAnswering>`_ - XLM Transformer with a token classification head on top (XLM Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+  * `XLMModel <./model_doc/xlm.html#pytorch_transformers.XLMModel>`__ - raw XLM Transformer model (\ **fully pre-trained**\ ),
+  * `XLMWithLMHeadModel <./model_doc/xlm.html#pytorch_transformers.XLMWithLMHeadModel>`__ - XLM Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLMForSequenceClassification <./model_doc/xlm.html#pytorch_transformers.XLMForSequenceClassification>`__ - XLM Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLMForQuestionAnswering <./model_doc/xlm.html#pytorch_transformers.XLMForQuestionAnswering>`__ - XLM Transformer with a token classification head on top (XLM Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
 
 *
-  Four **XLNet** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`_ file):
+  Four **XLNet** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`__ file):
 
 
-  * `XLNetModel <./model_doc/xlnet.html#pytorch_transformers.XLNetModel>`_ - raw XLNet Transformer model (\ **fully pre-trained**\ ),
-  * `XLNetLMHeadModel <./model_doc/xlnet.html#pytorch_transformers.XLNetLMHeadModel>`_ - XLNet Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
-  * `XLNetForSequenceClassification <./model_doc/xlnet.html#pytorch_transformers.XLNetForSequenceClassification>`_ - XLNet Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
-  * `XLNetForQuestionAnswering <./model_doc/xlnet.html#pytorch_transformers.XLNetForQuestionAnswering>`_ - XLNet Transformer with a token classification head on top (XLNet Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+  * `XLNetModel <./model_doc/xlnet.html#pytorch_transformers.XLNetModel>`__ - raw XLNet Transformer model (\ **fully pre-trained**\ ),
+  * `XLNetLMHeadModel <./model_doc/xlnet.html#pytorch_transformers.XLNetLMHeadModel>`__ - XLNet Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLNetForSequenceClassification <./model_doc/xlnet.html#pytorch_transformers.XLNetForSequenceClassification>`__ - XLNet Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLNetForQuestionAnswering <./model_doc/xlnet.html#pytorch_transformers.XLNetForQuestionAnswering>`__ - XLNet Transformer with a token classification head on top (XLNet Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
 
 
 TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers because I don't know the current philosophy behind them.
 
 *
-  Tokenizers for **BERT** (using word-piece) (in the `tokenization_bert.py <./_modules/pytorch_transformers/tokenization_bert.html>`_ file):
+  Tokenizers for **BERT** (using word-piece) (in the `tokenization_bert.py <./_modules/pytorch_transformers/tokenization_bert.html>`__ file):
 
   * ``BasicTokenizer`` - basic tokenization (punctuation splitting, lower casing, etc.),
   * ``WordpieceTokenizer`` - WordPiece tokenization,
@@ -189,44 +189,44 @@ TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers beca
 
 
 *
-  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <./_modules/pytorch_transformers/tokenization_openai.html>`_ file):
+  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <./_modules/pytorch_transformers/tokenization_openai.html>`__ file):
 
   * ``OpenAIGPTTokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
 
 
 *
-  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <./_modules/pytorch_transformers/tokenization_gpt2.html>`_ file):
+  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <./_modules/pytorch_transformers/tokenization_gpt2.html>`__ file):
 
   * ``GPT2Tokenizer`` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
 
 *
-  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <./_modules/pytorch_transformers/tokenization_transfo_xl.html>`_ file):
+  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <./_modules/pytorch_transformers/tokenization_transfo_xl.html>`__ file):
 
   * ``OpenAIGPTTokenizer`` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
 
 *
-  Tokenizer for **XLNet** (SentencePiece based tokenizer) (in the `tokenization_xlnet.py <./_modules/pytorch_transformers/tokenization_xlnet.html>`_ file):
+  Tokenizer for **XLNet** (SentencePiece based tokenizer) (in the `tokenization_xlnet.py <./_modules/pytorch_transformers/tokenization_xlnet.html>`__ file):
 
   * ``XLNetTokenizer`` - perform SentencePiece tokenization.
 
 
 *
-  Tokenizer for **XLM** (using Byte-Pair-Encoding) (in the `tokenization_xlm.py <./_modules/pytorch_transformers/tokenization_xlm.html>`_ file):
+  Tokenizer for **XLM** (using Byte-Pair-Encoding) (in the `tokenization_xlm.py <./_modules/pytorch_transformers/tokenization_xlm.html>`__ file):
 
   * ``GPT2Tokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
 
 
 *
-  Optimizer for **BERT** (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`_ file):
+  Optimizer for **BERT** (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file):
 
 
   * ``BertAdam`` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
 
 *
-  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <./_modules/pytorch_transformers/optimization_openai.html>`_ file):
+  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <./_modules/pytorch_transformers/optimization_openai.html>`__ file):
 
 
   * ``OpenAIAdam`` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
@@ -234,11 +234,11 @@ TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers beca
 
 *
   Configuration classes for BERT, OpenAI GPT, Transformer-XL, XLM and XLNet (in the respective \
-  `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`_\ , \
-  `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`_\ , \
-  `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`_, \
-  `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`_, \
-  `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`_ \
+  `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`__\ , \
+  `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`__\ , \
+  `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`__, \
+  `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`__, \
+  `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`__ \
   files):
 
 
@@ -253,47 +253,47 @@ The repository further comprises:
 
 
 *
-  Five examples on how to use **BERT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+  Five examples on how to use **BERT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
 
 
-  * `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_extract_features.py>`_ - Show how to extract hidden states from an instance of ``BertModel``\ ,
-  * `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_classifier.py>`_ - Show how to fine-tune an instance of ``BertForSequenceClassification`` on GLUE's MRPC task,
-  * `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_squad.py>`_ - Show how to fine-tune an instance of ``BertForQuestionAnswering`` on SQuAD v1.0 and SQuAD v2.0 tasks.
-  * `run_swag.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_swag.py>`_ - Show how to fine-tune an instance of ``BertForMultipleChoice`` on Swag task.
-  * `simple_lm_finetuning.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/lm_finetuning/simple_lm_finetuning.py>`_ - Show how to fine-tune an instance of ``BertForPretraining`` on a target text corpus.
+  * `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_extract_features.py>`__ - Show how to extract hidden states from an instance of ``BertModel``\ ,
+  * `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_classifier.py>`__ - Show how to fine-tune an instance of ``BertForSequenceClassification`` on GLUE's MRPC task,
+  * `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_squad.py>`__ - Show how to fine-tune an instance of ``BertForQuestionAnswering`` on SQuAD v1.0 and SQuAD v2.0 tasks.
+  * `run_swag.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_swag.py>`__ - Show how to fine-tune an instance of ``BertForMultipleChoice`` on Swag task.
+  * `simple_lm_finetuning.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/lm_finetuning/simple_lm_finetuning.py>`__ - Show how to fine-tune an instance of ``BertForPretraining`` on a target text corpus.
 
 *
-  One example on how to use **OpenAI GPT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+  One example on how to use **OpenAI GPT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
 
 
-  * `run_openai_gpt.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_openai_gpt.py>`_ - Show how to fine-tune an instance of ``OpenGPTDoubleHeadsModel`` on the RocStories task.
+  * `run_openai_gpt.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_openai_gpt.py>`__ - Show how to fine-tune an instance of ``OpenGPTDoubleHeadsModel`` on the RocStories task.
 
 *
-  One example on how to use **Transformer-XL** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+  One example on how to use **Transformer-XL** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
 
 
-  * `run_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_transfo_xl.py>`_ - Show how to load and evaluate a pre-trained model of ``TransfoXLLMHeadModel`` on WikiText 103.
+  * `run_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_transfo_xl.py>`__ - Show how to load and evaluate a pre-trained model of ``TransfoXLLMHeadModel`` on WikiText 103.
 
 *
-  One example on how to use **OpenAI GPT-2** in the unconditional and interactive mode (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`_\ ):
+  One example on how to use **OpenAI GPT-2** in the unconditional and interactive mode (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
 
 
-  * `run_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py>`_ - Show how to use OpenAI GPT-2 an instance of ``GPT2LMHeadModel`` to generate text (same as the original OpenAI GPT-2 examples).
+  * `run_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py>`__ - Show how to use OpenAI GPT-2 an instance of ``GPT2LMHeadModel`` to generate text (same as the original OpenAI GPT-2 examples).
 
-  These examples are detailed in the `Examples <#examples>`_ section of this readme.
+  These examples are detailed in the `Examples <#examples>`__ section of this readme.
 
 *
-  Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the `notebooks folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks>`_\ ):
+  Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the `notebooks folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks>`__\ ):
 
 
-  * `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_ - Compare the hidden states predicted by ``BertModel``\ ,
-  * `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_ - Compare the spans predicted by  ``BertForQuestionAnswering`` instances,
-  * `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_ - Compare the predictions of the ``BertForPretraining`` instances.
+  * `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`__ - Compare the hidden states predicted by ``BertModel``\ ,
+  * `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`__ - Compare the spans predicted by  ``BertForQuestionAnswering`` instances,
+  * `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`__ - Compare the predictions of the ``BertForPretraining`` instances.
 
-  These notebooks are detailed in the `Notebooks <#notebooks>`_ section of this readme.
+  These notebooks are detailed in the `Notebooks <#notebooks>`__ section of this readme.
 
 
 *
   A command-line interface to convert TensorFlow checkpoints (BERT, Transformer-XL) or NumPy checkpoint (OpenAI) in a PyTorch save of the associated PyTorch model:
 
-  This CLI is detailed in the `Command-line interface <#Command-line-interface>`_ section of this readme.
+  This CLI is detailed in the `Command-line interface <#Command-line-interface>`__ section of this readme.
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index d70fa3beb9..00e538e68d 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -14,6 +14,8 @@ Here is a detailed documentation of the classes in the package and how to use th
    * - `Serialization best-practices <#serialization-best-practices>`__
      - How to save and reload a fine-tuned model
    * - `Configurations <#configurations>`__
+     - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
+
 
 TODO Lysandre filled: Removed Models/Tokenizers/Optimizers as no single link can be made.
 
diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst
index 35d54370ba..592867a862 100644
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -11,6 +11,6 @@ We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-pret
   The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 
 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/mnotebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
 
 Please follow the instructions given in the notebooks to run and modify them.

From dee3e45b93e65e3de9cdf28f5ffbe148d91e361e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 10 Jul 2019 19:04:21 -0400
Subject: [PATCH 090/139] Fixed XLM weights conversion script. Added 5 new
 checkpoints for XLM.

---
 .../convert_xlm_checkpoint_to_pytorch.py               |  5 +++--
 pytorch_transformers/modeling_xlm.py                   | 10 ++++++++++
 pytorch_transformers/tokenization_xlm.py               | 10 ++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index e5815252f1..416f1bc16d 100755
--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -23,7 +23,8 @@ from io import open
 import torch
 import numpy
 
-from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
+from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers.modeling_xlm import (XLMConfig, XLMModel)
 from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
 
@@ -37,7 +38,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
     config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.Tensor, numpy.ndarray)))
 
     vocab = chkpt['dico_word2id']
-    vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in d.items())
+    vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
 
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 72ec6397a0..60dc1a7bed 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -37,9 +37,19 @@ logger = logging.getLogger(__name__)
 
 XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
 }
 XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.bin",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-configl.bin",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.bin",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.bin",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.bin",
 }
 
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 74bc56f350..a81567f21c 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -36,10 +36,20 @@ PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.bin",
+        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.bin",
+        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.bin",
+        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.bin",
+        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.bin",
     },
     'merges_file':
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.bin",
+        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.bin",
+        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.bin",
+        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.bin",
+        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.bin",
     },
 }
 

From 7fdbc47822ba7ea709da0559674b2b30c4eb796a Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 10 Jul 2019 19:37:24 -0400
Subject: [PATCH 091/139] Added the two CLM XLM pretrained checkpoints. Fixed
 file extensions for config/vocab/merges of XLM models.

---
 pytorch_transformers/modeling_xlm.py     | 14 +++++++++-----
 pytorch_transformers/tokenization_xlm.py | 24 ++++++++++++++----------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 60dc1a7bed..7567a0f24b 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -42,14 +42,18 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
     'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
     'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
 }
 XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
-    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.bin",
-    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-configl.bin",
-    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.bin",
-    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.bin",
-    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.bin",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-configl.json",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
 }
 
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index a81567f21c..58fefa104b 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -36,20 +36,24 @@ PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
-        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.bin",
-        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.bin",
-        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.bin",
-        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.bin",
-        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.bin",
+        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
+        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
+        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
+        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
+        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
+        'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
+        'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
     },
     'merges_file':
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
-        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.bin",
-        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.bin",
-        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.bin",
-        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.bin",
-        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.bin",
+        'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
+        'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
+        'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
+        'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
+        'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
+        'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
+        'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
     },
 }
 

From 4fef5919a58499f68bc1fe048949a99b86ff0228 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 12:03:08 +0200
Subject: [PATCH 092/139] updating examples

---
 examples/run_glue.py                          | 94 ++++++++++++-------
 examples/utils_glue.py                        |  1 +
 pytorch_transformers/modeling_bert.py         | 12 +--
 pytorch_transformers/modeling_gpt2.py         |  8 +-
 pytorch_transformers/modeling_openai.py       |  2 +-
 pytorch_transformers/modeling_transfo_xl.py   | 10 +-
 .../modeling_transfo_xl_utilities.py          | 70 --------------
 pytorch_transformers/modeling_utils.py        | 21 ++++-
 pytorch_transformers/modeling_xlnet.py        | 16 ++--
 .../tokenization_transfo_xl.py                | 32 +++----
 10 files changed, 116 insertions(+), 150 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 1e14a3e183..b0d8158d9a 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -18,46 +18,37 @@
 from __future__ import absolute_import, division, print_function
 
 import argparse
+import glob
 import logging
 import os
 import random
-from tqdm import tqdm, trange
 
 import numpy as np
-
 import torch
+from tensorboardX import SummaryWriter
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
 
-from tensorboardX import SummaryWriter
-
-from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenceClassification,
-                                  XLMForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
-                                  XLMTokenizer)
+from pytorch_transformers import WEIGHTS_NAME
+from pytorch_transformers import (BertConfig, BertForSequenceClassification,
+                                  BertTokenizer, XLMConfig,
+                                  XLMForSequenceClassification, XLMTokenizer,
+                                  XLNetConfig, XLNetForSequenceClassification,
+                                  XLNetTokenizer)
 from pytorch_transformers.optimization import BertAdam
-
-from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
-
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
 
 MODEL_CLASSES = {
-    'bert': BertForSequenceClassification,
-    'xlnet': XLNetForSequenceClassification,
-    'xlm': XLMForSequenceClassification,
-}
-
-TOKENIZER_CLASSES = {
-    'bert': BertTokenizer,
-    'xlnet': XLNetTokenizer,
-    'xlm': XLMTokenizer,
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
 }
 
 def train(args, train_dataset, model, tokenizer):
@@ -130,14 +121,26 @@ def train(args, train_dataset, model, tokenizer):
                 optimizer.step()
                 optimizer.zero_grad()
                 global_step += 1
+
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
                     if args.local_rank == -1:  # Only evaluate on single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
+                        results = evaluate(args, model, tokenizer, prefix=global_step)
                         for key, value in results.items():
                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                     tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                     logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+
             if args.max_steps > 0 and global_step > args.max_steps:
                 break
         if args.max_steps > 0 and global_step > args.max_steps:
@@ -146,7 +149,7 @@ def train(args, train_dataset, model, tokenizer):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer):
+def evaluate(args, model, tokenizer, prefix=""):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
     eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
@@ -202,7 +205,7 @@ def evaluate(args, model, tokenizer):
 
         output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
+            logger.info("***** Eval results {} *****".format(prefix))
             for key in sorted(result.keys()):
                 logger.info("  %s = %s", key, str(result[key]))
                 writer.write("%s = %s\n" % (key, str(result[key])))
@@ -264,6 +267,10 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--max_seq_length", default=128, type=int,
@@ -293,8 +300,12 @@ def main():
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
 
-    parser.add_argument('--logging_steps', type=int, default=100,
+    parser.add_argument('--logging_steps', type=int, default=50,
                         help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
@@ -363,11 +374,15 @@ def main():
         # Make sure only the first process in distributed training will download model & vocab
         torch.distributed.barrier()
 
-    args.model_type = args.model_name.lower().split('-')[0]
-    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -410,8 +425,17 @@ def main():
 
     # Evaluation
     if args.do_eval and args.local_rank in [-1, 0]:
-        results = evaluate(args, model, tokenizer)
-
+        checkpoints = [args.output_dir + './' + WEIGHTS_NAME]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        results = {}
+        for checkpoint in checkpoints:
+            global_step = int(checkpoints.split('-')[-1])
+            model = model_class.from_pretrained(checkpoints)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict(n + '_{}'.format())
         return results
 
 
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 4750592957..5ad36abf10 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -21,6 +21,7 @@ import csv
 import logging
 import os
 import sys
+from io import open
 
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 7cb723b563..5219bac601 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -73,17 +73,17 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -93,7 +93,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            logger.info("Skipping {}".format("/".join(name)))
             continue
         pointer = model
         for m_name in name:
@@ -113,7 +113,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
                 try:
                     pointer = getattr(pointer, l[0])
                 except AttributeError:
-                    print("Skipping {}".format("/".join(name)))
+                    logger.info("Skipping {}".format("/".join(name)))
                     continue
             if len(l) >= 2:
                 num = int(l[1])
@@ -127,7 +127,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 840016098a..5f777ec7b1 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -49,17 +49,17 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     tf_path = os.path.abspath(gpt2_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array.squeeze())
@@ -90,7 +90,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 024ff8eb41..706da7269b 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -110,7 +110,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 0c5d127d62..3717862186 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -126,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     # Build TF to PyTorch weights loading map
@@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
 
@@ -157,7 +157,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
                 except AssertionError as e:
                     e.args += (p_i.shape, arr_i.shape)
                     raise
-                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -165,13 +165,13 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
-            print("Initialize PyTorch weight {}".format(name))
+            logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
         tf_weights.pop(name + '/Adam', None)
         tf_weights.pop(name + '/Adam_1', None)
 
-    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
 
 
diff --git a/pytorch_transformers/modeling_transfo_xl_utilities.py b/pytorch_transformers/modeling_transfo_xl_utilities.py
index 2621a57517..6af13d7602 100644
--- a/pytorch_transformers/modeling_transfo_xl_utilities.py
+++ b/pytorch_transformers/modeling_transfo_xl_utilities.py
@@ -272,7 +272,6 @@ class LogUniformSampler(object):
             self.range_max = range_max
             log_indices = torch.arange(1., range_max+2., 1.).log_()
             self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-            # print('P', self.dist.numpy().tolist()[-30:])
 
             self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
 
@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
     logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
 
     return logits
-
-
-# class LogUniformSampler(object):
-#     def __init__(self, range_max, unique=False):
-#         """
-#         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
-#             `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
-#         """
-#         self.range_max = range_max
-#         log_indices = torch.arange(1., range_max+2., 1.).log_()
-#         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-
-#         self.unique = unique
-
-#         if self.unique:
-#             self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
-
-#     def sample(self, n_sample, labels):
-#         pos_sample, new_labels = labels.unique(return_inverse=True)
-#         n_pos_sample = pos_sample.size(0)
-#         n_neg_sample = n_sample - n_pos_sample
-
-#         if self.unique:
-#             self.exclude_mask.index_fill_(0, pos_sample, 1)
-#             sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
-#             self.exclude_mask.index_fill_(0, pos_sample, 0)
-#         else:
-#             sample_dist = self.dist
-
-#         neg_sample = torch.multinomial(sample_dist, n_neg_sample)
-
-#         sample = torch.cat([pos_sample, neg_sample])
-#         sample_prob = self.dist[sample]
-
-#         return new_labels, sample, sample_prob
-
-
-if __name__ == '__main__':
-    S, B = 3, 4
-    n_vocab = 10000
-    n_sample = 5
-    H = 32
-
-    labels = torch.LongTensor(S, B).random_(0, n_vocab)
-
-    # sampler = LogUniformSampler(n_vocab, unique=False)
-    # new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
-
-    sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
-    # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
-
-    # print('true_probs', true_probs.numpy().tolist())
-    # print('samp_probs', samp_probs.numpy().tolist())
-    # print('neg_samples', neg_samples.numpy().tolist())
-
-    # print('sum', torch.sum(sampler.dist).item())
-
-    # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
-
-    embedding = nn.Embedding(n_vocab, H)
-    bias = torch.zeros(n_vocab)
-    inputs = torch.Tensor(S, B, H).normal_()
-
-    logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
-    print('logits', logits.detach().numpy().tolist())
-    print('logits shape', logits.size())
-    print('out_labels', out_labels.detach().numpy().tolist())
-    print('out_labels shape', out_labels.size())
-
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 36b506da3b..49ea7c7a75 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -57,16 +57,18 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `xlnet-large-cased`
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
+                - a path or url to a directory containing a configuration file `config.json` for the model,
+                - a path or url to a configuration file for the model.
             cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
         """
         cache_dir = kwargs.pop('cache_dir', None)
 
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
@@ -200,6 +202,7 @@ class PreTrainedModel(nn.Module):
                 - a path or url to a tensorflow pretrained model checkpoint containing:
                     . `config.json` a configuration file for the model
                     . `model.chkpt` a TensorFlow checkpoint
+            config: an optional configuration for the model
             from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use
@@ -207,23 +210,31 @@ class PreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific XLNet class
                 (ex: num_labels for XLNetForSequenceClassification)
         """
+        config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
         from_tf = kwargs.pop('from_tf', False)
         output_loading_info = kwargs.pop('output_loading_info', False)
 
         # Load config
-        config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        if config is None:
+            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Load model
         if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
             archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-        else:
+        elif os.path.isdir(pretrained_model_name_or_path):
             if from_tf:
                 # Directly load from a TensorFlow checkpoint
                 archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
             else:
                 archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 1782cb2f84..fcc87cde4e 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -122,14 +122,14 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
         import numpy as np
         import tensorflow as tf
     except ImportError:
-        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     tf_weights = {}
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         tf_weights[name] = array
 
@@ -137,15 +137,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
     tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
 
     for name, pointer in tf_to_pt_map.items():
-        print("Importing {}".format(name))
+        logger.info("Importing {}".format(name))
         if name not in tf_weights:
-            print("{} not in tf pre-trained weights, skipping".format(name))
+            logger.info("{} not in tf pre-trained weights, skipping".format(name))
             continue
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
-            print("Transposing")
+            logger.info("Transposing")
             array = np.transpose(array)
         if isinstance(pointer, list):
             # Here we will split the TF weigths
@@ -157,7 +157,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
                 except AssertionError as e:
                     e.args += (p_i.shape, arr_i.shape)
                     raise
-                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
@@ -165,13 +165,13 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
-            print("Initialize PyTorch weight {}".format(name))
+            logger.info("Initialize PyTorch weight {}".format(name))
             pointer.data = torch.from_numpy(array)
         tf_weights.pop(name, None)
         tf_weights.pop(name + '/Adam', None)
         tf_weights.pop(name + '/Adam_1', None)
 
-    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
     return model
 
 
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 0b4e8c0ca5..fe1fe28e9a 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -98,14 +98,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             self.build_vocab()
 
     def count_file(self, path, verbose=False, add_eos=False):
-        if verbose: print('counting file {} ...'.format(path))
+        if verbose: logger.info('counting file {} ...'.format(path))
         assert os.path.exists(path)
 
         sents = []
         with open(path, 'r', encoding='utf-8') as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    print('    line {}'.format(idx))
+                    logger.info('    line {}'.format(idx))
                 symbols = self.tokenize(line, add_eos=add_eos)
                 self.counter.update(symbols)
                 sents.append(symbols)
@@ -116,10 +116,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         """
             sents : a list of sentences, each a list of tokenized symbols
         """
-        if verbose: print('counting {} sents ...'.format(len(sents)))
+        if verbose: logger.info('counting {} sents ...'.format(len(sents)))
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                print('    line {}'.format(idx))
+                logger.info('    line {}'.format(idx))
             self.counter.update(symbols)
 
     def _build_from_file(self, vocab_file):
@@ -147,11 +147,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
 
     def build_vocab(self):
         if self.vocab_file:
-            print('building vocab from {}'.format(self.vocab_file))
+            logger.info('building vocab from {}'.format(self.vocab_file))
             self._build_from_file(self.vocab_file)
-            print('final vocab size {}'.format(len(self)))
+            logger.info('final vocab size {}'.format(len(self)))
         else:
-            print('building vocab with min_freq={}, max_size={}'.format(
+            logger.info('building vocab with min_freq={}, max_size={}'.format(
                 self.min_freq, self.max_size))
             self.idx2sym = []
             self.sym2idx = OrderedDict()
@@ -163,18 +163,18 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
                 if cnt < self.min_freq: break
                 self.add_symbol(sym)
 
-            print('final vocab size {} from {} unique tokens'.format(
+            logger.info('final vocab size {} from {} unique tokens'.format(
                 len(self), len(self.counter)))
 
     def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
             add_double_eos=False):
-        if verbose: print('encoding file {} ...'.format(path))
+        if verbose: logger.info('encoding file {} ...'.format(path))
         assert os.path.exists(path)
         encoded = []
         with open(path, 'r', encoding='utf-8') as f:
             for idx, line in enumerate(f):
                 if verbose and idx > 0 and idx % 500000 == 0:
-                    print('    line {}'.format(idx))
+                    logger.info('    line {}'.format(idx))
                 symbols = self.tokenize(line, add_eos=add_eos,
                     add_double_eos=add_double_eos)
                 encoded.append(self.convert_to_tensor(symbols))
@@ -185,11 +185,11 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         return encoded
 
     def encode_sents(self, sents, ordered=False, verbose=False):
-        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
         encoded = []
         for idx, symbols in enumerate(sents):
             if verbose and idx > 0 and idx % 500000 == 0:
-                print('    line {}'.format(idx))
+                logger.info('    line {}'.format(idx))
             encoded.append(self.convert_to_tensor(symbols))
 
         if ordered:
@@ -218,7 +218,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
-            # print('encounter unk {}'.format(sym))
+            # logger.info('encounter unk {}'.format(sym))
             # assert '<eos>' not in sym
             if hasattr(self, 'unk_idx'):
                 return self.sym2idx.get(sym, self.unk_idx)
@@ -544,14 +544,14 @@ def get_lm_corpus(datadir, dataset):
     fn = os.path.join(datadir, 'cache.pt')
     fn_pickle = os.path.join(datadir, 'cache.pkl')
     if os.path.exists(fn):
-        print('Loading cached dataset...')
+        logger.info('Loading cached dataset...')
         corpus = torch.load(fn_pickle)
     elif os.path.exists(fn):
-        print('Loading cached dataset from pickle...')
+        logger.info('Loading cached dataset from pickle...')
         with open(fn, "rb") as fp:
             corpus = pickle.load(fp)
     else:
-        print('Producing dataset {}...'.format(dataset))
+        logger.info('Producing dataset {}...'.format(dataset))
         kwargs = {}
         if dataset in ['wt103', 'wt2']:
             kwargs['special'] = ['<eos>']

From ec07cf5a660926833d6f5208b58730e4af8d1178 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 14:48:22 +0200
Subject: [PATCH 093/139] rewamp optimization

---
 examples/run_glue.py                          |  77 +++--
 pytorch_transformers/__init__.py              |   4 +-
 pytorch_transformers/modeling_utils.py        |   2 +-
 pytorch_transformers/modeling_xlnet.py        |  13 -
 pytorch_transformers/optimization.py          | 275 ++++++------------
 pytorch_transformers/optimization_openai.py   | 127 --------
 .../tests/optimization_test.py                |  29 +-
 7 files changed, 138 insertions(+), 389 deletions(-)
 delete mode 100644 pytorch_transformers/optimization_openai.py

diff --git a/examples/run_glue.py b/examples/run_glue.py
index b0d8158d9a..93f69e1741 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -25,19 +25,21 @@ import random
 
 import numpy as np
 import torch
-from tensorboardX import SummaryWriter
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import WEIGHTS_NAME
-from pytorch_transformers import (BertConfig, BertForSequenceClassification,
-                                  BertTokenizer, XLMConfig,
-                                  XLMForSequenceClassification, XLMTokenizer,
-                                  XLNetConfig, XLNetForSequenceClassification,
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForSequenceClassification,
                                   XLNetTokenizer)
-from pytorch_transformers.optimization import BertAdam
+
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+
 from utils_glue import (compute_metrics, convert_examples_to_features,
                         output_modes, processors)
 
@@ -56,24 +58,24 @@ def train(args, train_dataset, model, tokenizer):
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    args.train_batch_size = args.per_gpu_train_batch_size * args.n_gpu
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     if args.max_steps > 0:
-        num_train_optimization_steps = args.max_steps
+        t_total = args.max_steps
         args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
     else:
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
-    # Prepare optimizer
+    # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
-                         t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -89,11 +91,11 @@ def train(args, train_dataset, model, tokenizer):
     logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                    args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
+    logger.info("  Total optimization steps = %d", t_total)
 
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
-    optimizer.zero_grad()
+    model.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
             model.train()
@@ -103,7 +105,7 @@ def train(args, train_dataset, model, tokenizer):
                       'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                       'labels':         batch[3]}
             ouputs = model(**inputs)
-            loss = ouputs[0]
+            loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -113,22 +115,25 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
                 optimizer.step()
-                optimizer.zero_grad()
+                model.zero_grad()
                 global_step += 1
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1:  # Only evaluate on single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, prefix=global_step)
+                    if args.local_rank == -1:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                     logging_loss = tr_loss
 
@@ -140,6 +145,7 @@ def train(args, train_dataset, model, tokenizer):
                     model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
                 break
@@ -162,20 +168,21 @@ def evaluate(args, model, tokenizer, prefix=""):
         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(eval_output_dir)
 
+        args.eval_batch_size = args.per_gpu_eval_batch_size * args.n_gpu
         # Note that DistributedSampler samples randomly
         eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
         # Eval!
-        logger.info("***** Running evaluation *****")
+        logger.info("***** Running evaluation {} *****".format(prefix))
         logger.info("  Num examples = %d", len(eval_dataset))
         logger.info("  Batch size = %d", args.eval_batch_size)
-        model.eval()
         eval_loss = 0
         nb_eval_steps = 0
         preds = None
         out_label_ids = None
         for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
             batch = tuple(t.to(args.device) for t in batch)
 
             with torch.no_grad():
@@ -186,7 +193,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
 
-            eval_loss += tmp_eval_loss.mean().item()
+                eval_loss += tmp_eval_loss.mean().item()
             nb_eval_steps += 1
             if preds is None:
                 preds = logits.detach().cpu().numpy()
@@ -213,7 +220,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     return results
 
 
-def load_and_cache_examples(args, task, tokenizer, evaluate=False, overwrite_cache=False):
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
@@ -285,20 +292,22 @@ def main():
 
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                         help="Batch size per GPU for training.")
-    parser.add_argument("--eval_batch_size", default=8, type=int,
-                        help="Total batch size for eval.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU for evaluation.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
                         help="Weight deay if we apply some.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--max_steps", default=-1, type=int,
                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
 
     parser.add_argument('--logging_steps', type=int, default=50,
                         help="Log every X updates steps.")
@@ -409,6 +418,7 @@ def main():
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)
 
+        logger.info("Saving model checkpoint to %s", args.output_dir)
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
         model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
@@ -427,15 +437,18 @@ def main():
     if args.do_eval and args.local_rank in [-1, 0]:
         checkpoints = [args.output_dir + './' + WEIGHTS_NAME]
         if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         results = {}
         for checkpoint in checkpoints:
-            global_step = int(checkpoints.split('-')[-1])
-            model = model_class.from_pretrained(checkpoints)
+            global_step = int(checkpoint.split('-')[-1])
+            model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=global_step)
-            result = dict(n + '_{}'.format())
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+
         return results
 
 
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index c8f64a07de..4c652e3596 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -36,7 +36,7 @@ from .modeling_xlm import (XLMConfig, XLMModel,
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                           PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
-from .optimization import BertAdam
-from .optimization_openai import OpenAIAdam
+from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
+                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 
 from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 49ea7c7a75..c304e7fdf0 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -104,7 +104,7 @@ class PretrainedConfig(object):
         for key in to_remove:
             kwargs.pop(key, None)
 
-        logger.info("Model config {}".format(config))
+        logger.info("Model config %s", config)
         return config
 
     @classmethod
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index fcc87cde4e..341447d8d2 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -211,10 +211,6 @@ class XLNetConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
 
                  dropout=0.1,
-                 dropatt=0.1,
-                 init="normal",
-                 init_range=0.1,
-                 init_std=0.02,
                  mem_len=None,
                  reuse_len=None,
                  bi_data=False,
@@ -258,11 +254,6 @@ class XLNetConfig(PretrainedConfig):
 
             dropout: float, dropout rate.
             dropatt: float, dropout rate on attention probabilities.
-            init: str, the initialization scheme, either "normal" or "uniform".
-            init_range: float, initialize the parameters with a uniform distribution
-                in [-init_range, init_range]. Only effective when init="uniform".
-            init_std: float, initialize the parameters with a normal distribution
-                with mean 0 and stddev init_std. Only effective when init="normal".
             mem_len: int, the number of tokens to cache.
             reuse_len: int, the number of tokens in the currect batch to be cached
                 and reused in the future.
@@ -297,11 +288,7 @@ class XLNetConfig(PretrainedConfig):
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
 
-            self.init = init
-            self.init_range = init_range
-            self.init_std = init_std
             self.dropout = dropout
-            self.dropatt = dropatt
             self.mem_len = mem_len
             self.reuse_len = reuse_len
             self.bi_data = bi_data
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index b2f2e43b1c..c7f169f0b6 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -14,174 +14,92 @@
 # limitations under the License.
 """PyTorch optimization for BERT model."""
 
+import logging
 import math
+
 import torch
 from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-import abc
-import sys
+from torch.optim.lr_scheduler import LambdaLR
 
 logger = logging.getLogger(__name__)
 
+class ConstantLRSchedule(LambdaLR):
+    def __init__(self, optimizer, last_epoch=-1):
+        super(ConstantLR, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
 
-if sys.version_info >= (3, 4):
-    ABC = abc.ABC
-else:
-    ABC = abc.ABCMeta('ABC', (), {})
-
-
-class _LRSchedule(ABC):
-    """ Parent of all LRSchedules here. """
-    warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense
-    def __init__(self, warmup=0.002, t_total=-1, **kw):
-        """
-        :param warmup:  what fraction of t_total steps will be used for linear warmup
-        :param t_total: how many training steps (updates) are planned
-        :param kw:
-        """
-        super(_LRSchedule, self).__init__(**kw)
-        if t_total < 0:
-            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
-        warmup = max(warmup, 0.)
-        self.warmup, self.t_total = float(warmup), float(t_total)
-        self.warned_for_t_total_at_progress = -1
-
-    def get_lr(self, step, nowarn=False):
-        """
-        :param step:    which of t_total steps we're on
-        :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps
-        :return:        learning rate multiplier for current update
-        """
-        if self.t_total < 0:
-            return 1.
-        progress = float(step) / self.t_total
-        ret = self.get_lr_(progress)
-        # warning for exceeding t_total (only active with warmup_linear
-        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
-            logger.warning(
-                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
-                    .format(ret, self.__class__.__name__))
-            self.warned_for_t_total_at_progress = progress
-        # end warning
-        return ret
-
-    @abc.abstractmethod
-    def get_lr_(self, progress):
-        """
-        :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress
-        :return:            learning rate multiplier for current update
-        """
-        return 1.
-
-
-class ConstantLR(_LRSchedule):
-    def get_lr_(self, progress):
-        return 1.
-
-
-class WarmupCosineSchedule(_LRSchedule):
+class WarmupCosineSchedule(LambdaLR):
     """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
+    Linearly increases learning rate from 0 to 1 over `warmup` training steps.
+    Decreases learning rate from 1. to 0. over remaining `t_total - warmup` steps following a cosine curve.
     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+    :param warmup:      see LRSchedule
+    :param t_total:     see LRSchedule
+    :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
+    :param kw:
     """
     warn_t_total = True
-    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
-        """
-        :param warmup:      see LRSchedule
-        :param t_total:     see LRSchedule
-        :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-        :param kw:
-        """
-        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
-        self.cycles = cycles
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
 
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            else:
+                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
+                return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
 
+        super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
-class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
+class WarmupCosineWithHardRestartsSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
     learning rate (with hard restarts).
     """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-        assert(cycles >= 1.)
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
 
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
-            return ret
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            else:
+                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
+                ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
+                return ret
+
+        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
-    """
-    All training progress is divided in `cycles` (default=1.) parts of equal length.
-    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        assert(warmup * cycles < 1.)
-        warmup = warmup * cycles if warmup >= 0 else warmup
-        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-
-    def get_lr_(self, progress):
-        progress = progress * self.cycles % 1.
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * progress))
-            return ret
-
-
-class WarmupConstantSchedule(_LRSchedule):
+class WarmupConstantSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
     """
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return 1.
+    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / warmup_steps
+            return 1.
+
+        super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-class WarmupLinearSchedule(_LRSchedule):
+class WarmupLinearSchedule(LambdaLR):
     """
     Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
     """
-    warn_t_total = True
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return max((progress - 1.) / (self.warmup - 1.), 0.)
+    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return step / max(1, warmup_steps)
+            return (t_total - step) / max(1, t_total - warmup_steps)
+
+        super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-SCHEDULES = {
-    None:       ConstantLR,
-    "none":     ConstantLR,
-    "warmup_cosine": WarmupCosineSchedule,
-    "warmup_constant": WarmupConstantSchedule,
-    "warmup_linear": WarmupLinearSchedule
-}
-
-
-class BertAdam(Optimizer):
-    """Implements BERT version of Adam algorithm with weight decay fix.
+class AdamW(Optimizer):
+    """ Implements Adam algorithm with weight decay fix.
 
     Parameters:
         lr: learning rate
@@ -197,46 +115,21 @@ class BertAdam(Optimizer):
         e: Adams epsilon. Default: 1e-6
         weight_decay: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
+        correct_bias: can be set to False to avoid correcting bias in Adam (e.g. like in Bert repository)
     """
-    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
-        if lr is not required and lr < 0.0:
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01, correct_bias=True):
+        if lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
-        if not e >= 0.0:
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1]  < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1] ))
+        if not 0.0 <= eps:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
-                        max_grad_norm=max_grad_norm)
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+                        correct_bias=correct_bias)
         super(BertAdam, self).__init__(params, defaults)
 
-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
-
     def step(self, closure=None):
         """Performs a single optimization step.
 
@@ -262,22 +155,28 @@ class BertAdam(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['next_m'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p.data)
                     # Exponential moving average of squared gradient values
-                    state['next_v'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
 
-                next_m, next_v = state['next_m'], state['next_v']
-                beta1, beta2 = group['b1'], group['b2']
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
 
-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])
+                state['step'] += 1
 
                 # Decay the first and second moment running average coefficient
                 # In-place operations to update the averages at the same time
-                next_m.mul_(beta1).add_(1 - beta1, grad)
-                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                update = next_m / (next_v.sqrt() + group['e'])
+                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                step_size = group['lr']
+                if group['correct_bias']:  # No bias correction for Bert
+                    bias_correction1 = 1 - beta1 ** state['step']
+                    bias_correction2 = 1 - beta2 ** state['step']
+                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(-step_size, exp_avg, denom)
 
                 # Just adding the square of the weights to the loss function is *not*
                 # the correct way of using L2 regularization/weight decay with Adam,
@@ -286,20 +185,8 @@ class BertAdam(Optimizer):
                 # Instead we want to decay the weights in a manner that doesn't interact
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
-                if group['weight_decay'] > 0.0:
-                    update += group['weight_decay'] * p.data
-
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-
-                update_with_lr = lr_scheduled * update
-                p.data.add_(-update_with_lr)
-
-                state['step'] += 1
-
-                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-                # No bias correction
-                # bias_correction1 = 1 - beta1 ** state['step']
-                # bias_correction2 = 1 - beta2 ** state['step']
+                # Add weight decay at the end (fixed version)
+                if group['weight_decay'] > 0:
+                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)
 
         return loss
diff --git a/pytorch_transformers/optimization_openai.py b/pytorch_transformers/optimization_openai.py
deleted file mode 100644
index bff4ebe61f..0000000000
--- a/pytorch_transformers/optimization_openai.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch optimization for OpenAI GPT model."""
-
-import math
-import torch
-from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \
-    WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule
-
-logger = logging.getLogger(__name__)
-
-
-class OpenAIAdam(Optimizer):
-    """Implements Open AI version of Adam algorithm with weight decay fix.
-    """
-    def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
-                 b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
-                 vector_l2=False, max_grad_norm=-1, **kwargs):
-        if lr is not required and lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= b1 < 1.0:
-            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
-        if not 0.0 <= b2 < 1.0:
-            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
-        if not e >= 0.0:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
-                        max_grad_norm=max_grad_norm)
-        super(OpenAIAdam, self).__init__(params, defaults)
-
-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['b1'], group['b2']
-
-                state['step'] += 1
-
-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                denom = exp_avg_sq.sqrt().add_(group['e'])
-
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-
-                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-
-                p.data.addcdiv_(-step_size, exp_avg, denom)
-
-                # Add weight decay at the end (fixed version)
-                if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
-                    p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
-
-        return loss
diff --git a/pytorch_transformers/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
index dfbbd44b6e..7a9c93048d 100644
--- a/pytorch_transformers/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -20,10 +20,9 @@ import unittest
 
 import torch
 
-from pytorch_transformers import BertAdam
-from pytorch_transformers import OpenAIAdam
-from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
-    WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
+from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
+                                  WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+
 import numpy as np
 
 
@@ -34,12 +33,12 @@ class OptimizationTest(unittest.TestCase):
         for a, b in zip(list1, list2):
             self.assertAlmostEqual(a, b, delta=tol)
 
-    def test_adam(self):
+    def test_adam_w(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
-        optimizer = BertAdam(params=[w], lr=2e-1,
+        optimizer = AdamW(params=[w], lr=2e-1,
                                           weight_decay=0.0,
                                           max_grad_norm=-1)
         for _ in range(100):
@@ -52,23 +51,13 @@ class OptimizationTest(unittest.TestCase):
 
 
 class ScheduleInitTest(unittest.TestCase):
-    def test_bert_sched_init(self):
+    def test_sched_init(self):
         m = torch.nn.Linear(50, 50)
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
-        # shouldn't fail
-
-    def test_openai_sched_init(self):
-        m = torch.nn.Linear(50, 50)
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
+        optim = AdamW(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
         self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
         # shouldn't fail
 

From b21d84b0276b31cccf56ebe714fa479e865787d2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 15:37:34 +0200
Subject: [PATCH 094/139] update examples

---
 examples/run_bert_classifier.py      | 528 --------------------------
 examples/run_glue.py                 |  10 +-
 examples/run_xlnet_classifier.py     | 530 ---------------------------
 examples/utils.py                    |  61 ---
 pytorch_transformers/optimization.py |   4 +-
 5 files changed, 9 insertions(+), 1124 deletions(-)
 delete mode 100644 examples/run_bert_classifier.py
 delete mode 100644 examples/run_xlnet_classifier.py
 delete mode 100644 examples/utils.py

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
deleted file mode 100644
index 27b8e6165d..0000000000
--- a/examples/run_bert_classifier.py
+++ /dev/null
@@ -1,528 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import sys
-import random
-from tqdm import tqdm, trange
-
-import numpy as np
-
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from tensorboardX import SummaryWriter
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForSequenceClassification
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-
-from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--task_name",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The name of the task to train.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--cache_dir",
-                        default="",
-                        type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    args.device = device
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    task_name = args.task_name.lower()
-
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
-
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    global_step = 0
-    nb_tr_steps = 0
-    tr_loss = 0
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-
-        # Prepare data loader
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                # define a new function to compute loss values for both output_modes
-                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                loss = ouputs[0]
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-
-    model.to(device)
-
-    ### Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = processor.get_dev_examples(args.data_dir)
-        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_eval_features_file, "rb") as reader:
-                eval_features = pickle.load(reader)
-        except:
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                with open(cached_eval_features_file, "wb") as writer:
-                    pickle.dump(eval_features, writer)
-
-
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        # Run prediction for full data
-        if args.local_rank == -1:
-            eval_sampler = SequentialSampler(eval_data)
-        else:
-            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss = 0
-        nb_eval_steps = 0
-        preds = []
-        out_label_ids = None
-
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                tmp_eval_loss, logits = outputs[:2]
-
-            eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if len(preds) == 0:
-                preds.append(logits.detach().cpu().numpy())
-                out_label_ids = label_ids.detach().cpu().numpy()
-            else:
-                preds[0] = np.append(
-                    preds[0], logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(
-                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        preds = preds[0]
-        if output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif output_mode == "regression":
-            preds = np.squeeze(preds)
-        result = compute_metrics(task_name, preds, out_label_ids)
-
-        loss = tr_loss/global_step if args.do_train else None
-
-        result['eval_loss'] = eval_loss
-        result['global_step'] = global_step
-        result['loss'] = loss
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-        # hack for MNLI-MM
-        if task_name == "mnli":
-            task_name = "mnli-mm"
-            processor = processors[task_name]()
-
-            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(args.output_dir + '-MM'):
-                os.makedirs(args.output_dir + '-MM')
-
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Run prediction for full data
-            eval_sampler = SequentialSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = []
-            out_label_ids = None
-
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                label_ids = label_ids.to(device)
-
-                with torch.no_grad():
-                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
-
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if len(preds) == 0:
-                    preds.append(logits.detach().cpu().numpy())
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds[0] = np.append(
-                        preds[0], logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(
-                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            preds = preds[0]
-            preds = np.argmax(preds, axis=1)
-            result = compute_metrics(task_name, preds, out_label_ids)
-
-            loss = tr_loss/global_step if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 93f69e1741..aaf9a9876c 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -74,8 +74,8 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-    schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -300,6 +300,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
                         help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
                         help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
@@ -358,7 +360,9 @@ def main():
     args.device = device
 
     # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                 args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
deleted file mode 100644
index 35b0ebfbd1..0000000000
--- a/examples/run_xlnet_classifier.py
+++ /dev/null
@@ -1,530 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import sys
-import random
-from tqdm import tqdm, trange
-
-import numpy as np
-
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from tensorboardX import SummaryWriter
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification
-from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-
-from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-    # training
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0 limit the number of training steps to perform, you should choose only one of num_train_epochs and max_steps.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--clip_gradients", default=1.0, type=float,
-                        help="Clip gradient norms.")
-    parser.add_argument("--train_batch_size", default=32, type=int,
-                        help="Total batch size for training.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale', type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--log_every", default=10, type=int,
-                        help="Log metrics every X training steps.")
-    # evaluation
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--eval_batch_size", default=8, type=int,
-                        help="Total batch size for eval.")
-    # Model
-    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
-                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    # task specific
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    # Misc
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    args.device = device
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    task_name = args.task_name.lower()
-
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
-
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
-    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    global_step = 0
-    curr_tr_loss, curr_steps = 0., 1
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-
-        # Prepare data loader
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.xlnet_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        if os.path.exists(cached_train_features_file):
-            logger.info("Loading train features for cache file %s", cached_train_features_file)
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        else:
-            logger.info("No cache file at %s, preparing train features", cached_train_features_file)
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.cls_token,
-                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-                pad_on_left=True, pad_token_segment_id=4)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        if args.local_rank == -1:
-            train_sampler = SequentialSampler(train_data)  # RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        if args.max_steps > 0:
-            num_train_optimization_steps = args.max_steps
-        else:
-            num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-
-        optimizer_grouped_parameters = model.parameters()
-        # param_optimizer = list(model.named_parameters())
-        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        # optimizer_grouped_parameters = [
-        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        #     ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs) if args.max_steps <= 0 else int('Inf'),
-                        desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            for step, batch in enumerate(tqdm(train_dataloader,
-                                              desc="Iteration",
-                                              disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                # define a new function to compute loss values for both output_modes
-                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)
-
-                curr_tr_loss += loss.item()
-                curr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (global_step + 1) % args.log_every == 0):
-                        learning_rate = optimizer.get_lr()[0] if not args.fp16 else lr_this_step
-                        logger.info("[{}] | gnorm {:.2f} lr {:8.6f} | loss {:.2f}".format(
-                            global_step, gnorm, learning_rate, curr_tr_loss / curr_steps))
-                        tb_writer.add_scalar('lr', learning_rate, global_step)
-                        tb_writer.add_scalar('loss', curr_tr_loss / curr_steps, global_step)
-                        curr_tr_loss, curr_steps = 0., 1
-
-                    if args.max_steps > 0 and global_step > args.max_steps:
-                        break
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                break
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
-        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
-
-    model.to(device)
-
-    ### Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = processor.get_dev_examples(args.data_dir)
-        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-            list(filter(None, args.xlnet_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        if os.path.exists(cached_eval_features_file):
-            logger.info("Loading eval features for cache file %s", cached_eval_features_file)
-            with open(cached_eval_features_file, "rb") as reader:
-                eval_features = pickle.load(reader)
-        else:
-            logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.cls_token,
-                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-                pad_on_left=True, pad_token_segment_id=4)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                with open(cached_eval_features_file, "wb") as writer:
-                    pickle.dump(eval_features, writer)
-
-
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        # Run prediction for full data
-        if args.local_rank == -1:
-            eval_sampler = SequentialSampler(eval_data)
-        else:
-            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss = 0
-        nb_eval_steps = 0
-        preds = []
-        out_label_ids = None
-
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-            # create eval loss and other metric required by the task
-            if output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-            elif output_mode == "regression":
-                loss_fct = MSELoss()
-                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
-
-            eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if len(preds) == 0:
-                preds.append(logits.detach().cpu().numpy())
-                out_label_ids = label_ids.detach().cpu().numpy()
-            else:
-                preds[0] = np.append(
-                    preds[0], logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(
-                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        preds = preds[0]
-        if output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif output_mode == "regression":
-            preds = np.squeeze(preds)
-        result = compute_metrics(task_name, preds, out_label_ids)
-
-        loss = curr_tr_loss/curr_steps if args.do_train else None
-
-        result['eval_loss'] = eval_loss
-        result['global_step'] = global_step
-        result['loss'] = loss
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-        # hack for MNLI-MM
-        if task_name == "mnli":
-            task_name = "mnli-mm"
-            processor = processors[task_name]()
-
-            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(args.output_dir + '-MM'):
-                os.makedirs(args.output_dir + '-MM')
-
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Run prediction for full data
-            eval_sampler = SequentialSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = []
-            out_label_ids = None
-
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                label_ids = label_ids.to(device)
-
-                with torch.no_grad():
-                    logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
-
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if len(preds) == 0:
-                    preds.append(logits.detach().cpu().numpy())
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds[0] = np.append(
-                        preds[0], logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(
-                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            preds = preds[0]
-            preds = np.argmax(preds, axis=1)
-            result = compute_metrics(task_name, preds, out_label_ids)
-
-            loss = curr_tr_loss/curr_steps if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/utils.py b/examples/utils.py
deleted file mode 100644
index e4b7263efa..0000000000
--- a/examples/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2019-present, the HuggingFace Inc. authors.
-# All rights reserved. This source code is licensed under the BSD-style
-# license found in the LICENSE file in the root directory of this source tree.
-import logging
-import os
-from tqdm import tqdm
-from pprint import pformat
-
-import torch
-
-from ignite.engine import Engine, Events
-from ignite.handlers import ModelCheckpoint
-from ignite.metrics import RunningAverage
-from ignite.contrib.handlers import ProgressBar
-from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger
-
-
-def average_distributed_scalar(scalar, args):
-    """ Average a scalar over nodes if we are in distributed training.
-        We use this for distributed evaluation.
-        Beware, such averages only works for metrics which are additive with regard
-        to the evaluation dataset, e.g. accuracy, log probabilities.
-        Doesn't work for ratio metrics like F1.
-    """
-    if args.local_rank == -1:
-        return scalar
-    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
-    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
-    return scalar_t.item()
-
-
-def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
-    """ Add to a PyTorch ignite training engine tensorboard logging,
-        progress bar with average loss, checkpoint saving and save training config.
-    """
-    # Add progress bar with average loss
-    RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss")
-    pbar = ProgressBar(persist=True)
-    pbar.attach(trainer, metric_names=[prefix + "loss"])
-    evaluator.add_event_handler(Events.COMPLETED, lambda _:
-                                pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
-
-    # Add tensorboard logging with training and evaluation metrics
-    tb_logger = TensorboardLogger(log_dir=None)
-    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]),
-                     event_name=Events.ITERATION_COMPLETED)
-    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer),
-                     event_name=Events.ITERATION_STARTED)
-    @evaluator.on(Events.COMPLETED)
-    def tb_log_metrics(engine):
-        for name in metrics.keys():
-            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)
-
-    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
-    checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
-    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})
-
-    # Save training configuration
-    torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
-
-    return checkpoint_handler, tb_logger
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index c7f169f0b6..c78818dd74 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 
 class ConstantLRSchedule(LambdaLR):
     def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLR, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
 
 class WarmupCosineSchedule(LambdaLR):
     """
@@ -128,7 +128,7 @@ class AdamW(Optimizer):
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                         correct_bias=correct_bias)
-        super(BertAdam, self).__init__(params, defaults)
+        super(AdamW, self).__init__(params, defaults)
 
     def step(self, closure=None):
         """Performs a single optimization step.

From 6135de2fa3391f6c311ceec7011f05dd712cbeb5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 15:39:49 +0200
Subject: [PATCH 095/139] readme update

---
 README.md | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 7eded53d47..f916627b90 100644
--- a/README.md
+++ b/README.md
@@ -1620,20 +1620,10 @@ and unpack it to some directory `$GLUE_DIR`.
 ```shell
 export GLUE_DIR=/path/to/glue
 
-python run_xlnet_classifier.py \
- --task_name STS-B \
- --do_train \
- --do_eval \
- --data_dir $GLUE_DIR/STS-B/ \
- --max_seq_length 128 \
- --train_batch_size 8 \
- --gradient_accumulation_steps 1 \
- --learning_rate 5e-5 \
- --num_train_epochs 3.0 \
- --output_dir /tmp/mrpc_output/
+CUDA_VISIBLE_DEVICES=0,1,2,3 python ./examples/run_glue.py   --do_train  --task_name=sts-b     --data_dir=${GLUE_DIR}/STS-B   --output_dir=./proc_data/sts-b-110   --max_seq_length=128   --per_gpu_eval_batch_size=8   --per_gpu_train_batch_size=8   --max_steps=1200  --model_name=xlnet-large-cased   --overwrite_output_dir   --overwrite_cache --warmup_steps=120
 ```
 
-Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) gave evaluation results between 84% and 88%.
+This hyper-parameters give evaluation results pearsonr > 0.918.
 
 ### Distributed training
 

From ccb6947dc1a5ddc9e1e6c3dc7f010385ed92c2b6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 17:39:47 +0200
Subject: [PATCH 096/139] optimization tests

---
 examples/run_glue.py                          | 29 +++++---
 examples/test_examples.py                     | 15 +++-
 pytorch_transformers/optimization.py          | 23 +++---
 .../tests/optimization_test.py                | 73 +++++++++++++------
 4 files changed, 91 insertions(+), 49 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index aaf9a9876c..c3dffb4fdb 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -96,8 +96,10 @@ def train(args, train_dataset, model, tokenizer):
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
-    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
@@ -129,7 +131,7 @@ def train(args, train_dataset, model, tokenizer):
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
-                    if args.local_rank == -1:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
                         for key, value in results.items():
                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
@@ -148,8 +150,10 @@ def train(args, train_dataset, model, tokenizer):
                     logger.info("Saving model checkpoint to %s", output_dir)
 
             if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
                 break
         if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
             break
 
     return global_step, tr_loss / global_step
@@ -164,11 +168,10 @@ def evaluate(args, model, tokenizer, prefix=""):
     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
         eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
 
-        """ Evaluate the model """
         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(eval_output_dir)
 
-        args.eval_batch_size = args.per_gpu_eval_batch_size * args.n_gpu
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
         # Note that DistributedSampler samples randomly
         eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
@@ -177,7 +180,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         logger.info("***** Running evaluation {} *****".format(prefix))
         logger.info("  Num examples = %d", len(eval_dataset))
         logger.info("  Batch size = %d", args.eval_batch_size)
-        eval_loss = 0
+        eval_loss = 0.0
         nb_eval_steps = 0
         preds = None
         out_label_ids = None
@@ -287,6 +290,8 @@ def main():
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
 
@@ -364,7 +369,7 @@ def main():
                         datefmt = '%m/%d/%Y %H:%M:%S',
                         level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
     # Setup seeds
     random.seed(args.seed)
@@ -409,6 +414,8 @@ def main():
     elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    logger.info("Training/evaluation parameters %s", args)
+
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
@@ -438,22 +445,22 @@ def main():
         model.to(args.device)
 
     # Evaluation
+    results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir + './' + WEIGHTS_NAME]
+        checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        results = {}
         for checkpoint in checkpoints:
-            global_step = int(checkpoint.split('-')[-1])
+            global_step = checkpoint.split('-')[-1]
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=global_step)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
-        return results
+    return results
 
 
 if __name__ == "__main__":
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 56c30efae4..dec59358b8 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -45,9 +45,18 @@ class ExamplesTests(unittest.TestCase):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
 
-        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
-                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
-                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        testargs = ["run_glue.py",
+                    "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc",
+                    "--do_train",
+                    "--do_eval",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--learning_rate=1e-4",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--overwrite_output_dir"]
         model_name = "--model_name=bert-base-uncased"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_glue.main()
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index c78818dd74..8d224f1294 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 
 class ConstantLRSchedule(LambdaLR):
     def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
 
 class WarmupCosineSchedule(LambdaLR):
     """
@@ -42,10 +42,10 @@ class WarmupCosineSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
+                return float(step) / float(max(1.0, warmup_steps))
             else:
-                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
-                return 0.5 * (1. + math.cos(math.pi * cycles * 2 * progress))
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                return 0.5 * (1. + math.cos(math.pi * float(cycles) * 2.0 * progress))
 
         super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
@@ -59,11 +59,12 @@ class WarmupCosineWithHardRestartsSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
+                return float(step) / float(max(1, warmup_steps))
             else:
-                progress = (step - warmup_steps) / max(1, t_total - warmup_steps)   # progress after warmup
-                ret = 0.5 * (1. + math.cos(math.pi * ((cycles * progress) % 1)))
-                return ret
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                if progress >= 1.0:
+                    return 0.0
+                return 0.5 * (1. + math.cos(math.pi * ((float(cycles) * progress) % 1.0)))
 
         super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
@@ -77,7 +78,7 @@ class WarmupConstantSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / warmup_steps
+                return float(step) / float(max(1.0, warmup_steps))
             return 1.
 
         super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@@ -92,8 +93,8 @@ class WarmupLinearSchedule(LambdaLR):
 
         def lr_lambda(step):
             if step < warmup_steps:
-                return step / max(1, warmup_steps)
-            return (t_total - step) / max(1, t_total - warmup_steps)
+                return float(step) / float(max(1, warmup_steps))
+            return float(t_total - step) / float(max(1.0, t_total - warmup_steps))
 
         super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
diff --git a/pytorch_transformers/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
index 7a9c93048d..ef1a1b1d50 100644
--- a/pytorch_transformers/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -26,6 +26,13 @@ from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSched
 import numpy as np
 
 
+def unwrap_schedule(scheduler, num_steps=10):
+    lrs = []
+    for _ in range(num_steps):
+        scheduler.step()
+        lrs.append(scheduler.get_lr())
+    return lrs
+
 class OptimizationTest(unittest.TestCase):
 
     def assertListAlmostEqual(self, list1, list2, tol):
@@ -38,9 +45,7 @@ class OptimizationTest(unittest.TestCase):
         target = torch.tensor([0.4, 0.2, -0.5])
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
-        optimizer = AdamW(params=[w], lr=2e-1,
-                                          weight_decay=0.0,
-                                          max_grad_norm=-1)
+        optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
         for _ in range(100):
             loss = criterion(w, target)
             loss.backward()
@@ -51,29 +56,49 @@ class OptimizationTest(unittest.TestCase):
 
 
 class ScheduleInitTest(unittest.TestCase):
-    def test_sched_init(self):
-        m = torch.nn.Linear(50, 50)
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], ConstantLR))
-        optim = AdamW(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
-        self.assertTrue(isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
-        # shouldn't fail
+    m = torch.nn.Linear(50, 50)
+    optimizer = AdamW(m.parameters(), lr=10.)
+    num_steps = 10
 
+    def assertListAlmostEqual(self, list1, list2, tol):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
 
-class WarmupCosineWithRestartsTest(unittest.TestCase):
-    def test_it(self):
-        m = WarmupCosineWithWarmupRestartsSchedule(warmup=0.05, t_total=1000., cycles=5)
-        x = np.arange(0, 1000)
-        y = [m.get_lr(xe) for xe in x]
-        y = np.asarray(y)
-        expected_zeros = y[[0, 200, 400, 600, 800]]
-        print(expected_zeros)
-        expected_ones = y[[50, 250, 450, 650, 850]]
-        print(expected_ones)
-        self.assertTrue(np.allclose(expected_ones, 1))
-        self.assertTrue(np.allclose(expected_zeros, 0))
+    def test_constant_scheduler(self):
+        scheduler = ConstantLRSchedule(self.optimizer)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [10.] * self.num_steps
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_constant_scheduler(self):
+        scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_linear_scheduler(self):
+        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)
+
+    def test_warmup_cosine_scheduler(self):
+        scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
+
+    def test_warmup_cosine_hard_restart_scheduler(self):
+        scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10)
+        lrs = unwrap_schedule(scheduler, self.num_steps)
+        expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0]
+        self.assertEqual(len(lrs[0]), 1)
+        self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2)
 
 
 if __name__ == "__main__":

From 6491575fd56124084b1715a0557cbda806a7b7f6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 12:38:21 -0400
Subject: [PATCH 097/139] Added TorchScript disclaimer. CSS modifications.

---
 docs/source/_static/css/huggingface.css | 34 +++++++++++-
 docs/source/conf.py                     |  4 +-
 docs/source/examples.rst                | 69 ++++++++++++++++++++++---
 docs/source/torchscript.rst             |  7 +++
 4 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 15b5030972..84740cb4df 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,5 @@
+huggingface.css
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
@@ -31,12 +33,12 @@
 
 /* When a list item that does belong to the selected block from the toc tree is hovered */
 .wy-menu-vertical li.current a:hover{
-    background-color: #FB8D68;
+    background-color: #B6C0FF;
 }
 
 /* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
 .wy-menu-vertical li a:hover{
-    background-color: #FB8D68;
+    background-color: #A7AFFB;
 }
 
 /* The text items on the toc tree */
@@ -114,6 +116,11 @@ a {
     text-transform: uppercase;
 }
 
+/* It would be better for table to be visible without horizontal scrolling */
+.wy-table-responsive table td, .wy-table-responsive table th{
+    white-space: normal;
+}
+
 .footer {
     margin-top: 20px;
 }
@@ -127,6 +134,28 @@ a {
     margin: 2px 5px 0 0;
 }
 
+/* class and method names in doc */
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
+    font-family: Calibre;
+    font-size: 20px !important;
+}
+
+/* class name in doc*/
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
+    margin-right: 10px;
+    font-family: Calibre-Medium;
+}
+
+/* Method and class parameters */
+.sig-param{
+    line-height: 23px;
+}
+
+/* Class introduction "class" string at beginning */
+.rst-content dl:not(.docutils) .property{
+    font-size: 18px;
+    color: black;
+}
 
 
 /* FONTS */
@@ -167,3 +196,4 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
     src: url(./Calibre-Thin.otf);
     font-weight:400;
 }
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 16866b5e5c..79df358631 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -85,7 +85,9 @@ html_theme = 'sphinx_rtd_theme'
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+html_theme_options = {
+    'analytics_id': 'UA-83738774-2'
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index aee4066c2c..51c8d850b9 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -1,3 +1,5 @@
+examples.rst
+
 Examples
 ================================================
 
@@ -39,7 +41,13 @@ Note: To use *Distributed Training*\ , you will need to run one training script
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+    python -m torch.distributed.launch \
+        --nproc_per_node=4 \
+        --nnodes=2 \
+        --node_rank=$THIS_MACHINE_INDEX \
+        --master_addr="192.168.1.1" \
+        --master_port=1234 run_bert_classifier.py \
+        (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 
 Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
 
@@ -186,7 +194,19 @@ Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name MRPC --do_train   --do_eval   --do_lower_case   --data_dir $GLUE_DIR/MRPC/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0  --output_dir /tmp/mrpc_output/
+    python -m torch.distributed.launch \
+        --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name MRPC \
+        --do_train \
+        --do_eval \
+        --do_lower_case \
+        --data_dir $GLUE_DIR/MRPC/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+         --output_dir /tmp/mrpc_output/
 
 Training with these hyper-parameters gave us the following results:
 
@@ -203,7 +223,20 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --do_lower_case   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+    python -m torch.distributed.launch \
+        --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name mnli \
+        --do_train \
+        --do_eval \
+        --do_lower_case \
+        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+        --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -293,7 +326,20 @@ And here is the model provided as ``bert-large-cased-whole-word-masking-finetune
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
+    python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py \
+        --bert_model bert-large-cased-whole-word-masking \
+        --do_train \
+        --do_predict \
+        --do_lower_case \
+        --train_file $SQUAD_DIR/train-v1.1.json \
+        --predict_file $SQUAD_DIR/dev-v1.1.json \
+        --learning_rate 3e-5 \
+        --num_train_epochs 2 \
+        --max_seq_length 384 \
+        --doc_stride 128 \
+        --output_dir ../models/wwm_cased_finetuned_squad/ \
+        --train_batch_size 24 \
+        --gradient_accumulation_steps 12
 
 Training with these hyper-parameters gave us the following results:
 
@@ -563,7 +609,18 @@ Here is an example on MNLI:
 
 .. code-block:: bash
 
-   python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
+    python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name mnli \
+        --do_train \
+        --do_eval \
+        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+        --overwrite_output_dir
 
 .. code-block:: bash
 
@@ -579,4 +636,4 @@ Here is an example on MNLI:
      global_step = 18408
      loss = 0.04755385363816904
 
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
\ No newline at end of file
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index f5eb97f69d..1b84559567 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -1,6 +1,13 @@
 TorchScript
 ================================================
 
+.. note::
+    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
+    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
+    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
+    with compiled TorchScript.
+
+
 According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.

From 92a782b10836d8d8aae85a8c17932f36729d01bd Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 22:20:10 +0200
Subject: [PATCH 098/139] fix run_glue test

---
 examples/run_glue.py                 | 18 ++++++++++++------
 pytorch_transformers/optimization.py | 10 +++++-----
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index c3dffb4fdb..7e615804c1 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -53,6 +53,15 @@ MODEL_CLASSES = {
     'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
 }
 
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
@@ -97,6 +106,7 @@ def train(args, train_dataset, model, tokenizer):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -371,12 +381,8 @@ def main():
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                     args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
-    # Setup seeds
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
+    # Set seed
+    set_seed(args)
 
     # Prepare GLUE task
     args.task_name = args.task_name.lower()
diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index 8d224f1294..f0ac914341 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -167,14 +167,14 @@ class AdamW(Optimizer):
 
                 # Decay the first and second moment running average coefficient
                 # In-place operations to update the averages at the same time
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 
                 step_size = group['lr']
                 if group['correct_bias']:  # No bias correction for Bert
-                    bias_correction1 = 1 - beta1 ** state['step']
-                    bias_correction2 = 1 - beta2 ** state['step']
+                    bias_correction1 = 1.0 - beta1 ** state['step']
+                    bias_correction2 = 1.0 - beta2 ** state['step']
                     step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
 
                 p.data.addcdiv_(-step_size, exp_avg, denom)
@@ -187,7 +187,7 @@ class AdamW(Optimizer):
                 # with the m/v parameters. This is equivalent to adding the square
                 # of the weights to the loss with plain (non-momentum) SGD.
                 # Add weight decay at the end (fixed version)
-                if group['weight_decay'] > 0:
+                if group['weight_decay'] > 0.0:
                     p.data.add_(-group['lr'] * group['weight_decay'], p.data)
 
         return loss

From c6bf1a400df220ddbe6f74ffd6456d0728d51e4f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 22:29:08 +0200
Subject: [PATCH 099/139] fix test examples et model pretrained

---
 examples/test_examples.py                         | 3 ++-
 pytorch_transformers/modeling_utils.py            | 2 +-
 pytorch_transformers/tests/modeling_utils_test.py | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index dec59358b8..2e6ed45063 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -56,7 +56,8 @@ class ExamplesTests(unittest.TestCase):
                     "--learning_rate=1e-4",
                     "--max_steps=10",
                     "--warmup_steps=2",
-                    "--overwrite_output_dir"]
+                    "--overwrite_output_dir",
+                    "--seed=42"]
         model_name = "--model_name=bert-base-uncased"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_glue.main()
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index c304e7fdf0..a9445ecad5 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -49,7 +49,7 @@ class PretrainedConfig(object):
         self.torchscript = kwargs.pop('torchscript', False)
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *input, **kwargs):
         """
         Instantiate a PretrainedConfig from a pre-trained model configuration.
 
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
index a168c24611..4944f41228 100644
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -30,6 +30,7 @@ class ModelUtilsTest(unittest.TestCase):
             self.assertIsNotNone(config)
             self.assertIsInstance(config, PretrainedConfig)
 
+            model = BertModel.from_pretrained(model_name)
             model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, PreTrainedModel)

From 2b644785f06917810bcacedeaabcf07b1ca76fc4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 22:31:50 +0200
Subject: [PATCH 100/139] add tests on examples and large circle ci config

---
 .circleci/config.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 51b0ffa04e..a9e4ea06f2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -5,6 +5,7 @@ jobs:
         docker:
             - image: circleci/python:3.5
         resource_class: large
+        parallelism: 4
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
@@ -12,10 +13,12 @@ jobs:
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./examples/
             - run: codecov
     build_py2:
         working_directory: ~/pytorch-transformers
         resource_class: large
+        parallelism: 4
         docker:
             - image: circleci/python:2.7
         steps:
@@ -25,6 +28,7 @@ jobs:
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./examples/
             - run: codecov
 workflows:
   version: 2

From 6b13f4cb3a0eed03ce89ddf1b5d4b054d749d722 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 22:36:35 +0200
Subject: [PATCH 101/139] update circle-ci

---
 .circleci/config.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a9e4ea06f2..78358d1188 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,8 +10,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install spacy ftfy==4.4.3
-            - run: sudo python -m spacy download en
+            - run: sudo pip install tensorboardX scikit-learn
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
@@ -25,8 +24,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install spacy ftfy==4.4.3
-            - run: sudo python -m spacy download en
+            - run: sudo pip install tensorboardX scikit-learn
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov

From 273617b86dbe5cd15afb795e994dffc44e09e2df Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 11 Jul 2019 22:45:03 +0200
Subject: [PATCH 102/139] update config - fix gpt/gpt-2 from pretrained

---
 .circleci/config.yml                    | 2 +-
 pytorch_transformers/modeling_gpt2.py   | 2 +-
 pytorch_transformers/modeling_openai.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 78358d1188..65e392d2da 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,7 +24,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo pip install tensorboardX scikit-learn mock
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 66ff4e7185..29d1cbae42 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -423,7 +423,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         """
         num_special_tokens = kwargs.pop('num_special_tokens', None)
 
-        model = PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
+        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index c81d082c70..aa35b163f1 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -431,7 +431,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
         num_special_tokens = kwargs.get('num_special_tokens', None)
         kwargs.pop('num_special_tokens', None)
 
-        model = PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
+        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights

From 50e62a4cb4d503e3559b88838b8cf9f745fef516 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 16:50:21 -0400
Subject: [PATCH 103/139] fix gpt/gpt-2 from pretrained

---
 pytorch_transformers/modeling_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 29d1cbae42..495e002529 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -423,7 +423,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         """
         num_special_tokens = kwargs.pop('num_special_tokens', None)
 
-        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights

From bd404735a7f282a41b11e240eb7c880e329567c3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 00:02:49 +0200
Subject: [PATCH 104/139] embeddings resizing + tie_weights

---
 pytorch_transformers/modeling_bert.py         |  53 +++++--
 pytorch_transformers/modeling_gpt2.py         | 148 +++--------------
 pytorch_transformers/modeling_openai.py       | 150 ++++--------------
 pytorch_transformers/modeling_transfo_xl.py   |   7 +
 pytorch_transformers/modeling_utils.py        |  41 ++++-
 pytorch_transformers/modeling_xlm.py          |  13 +-
 pytorch_transformers/modeling_xlnet.py        |  12 +-
 .../tests/modeling_bert_test.py               |   2 +-
 ...sts_commons.py => modeling_common_test.py} |  45 +++++-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   2 +-
 .../tests/modeling_utils_test.py              |  47 ------
 .../tests/modeling_xlm_test.py                |   2 +-
 .../tests/modeling_xlnet_test.py              |   2 +-
 15 files changed, 196 insertions(+), 332 deletions(-)
 rename pytorch_transformers/tests/{modeling_tests_commons.py => modeling_common_test.py} (91%)
 delete mode 100644 pytorch_transformers/tests/modeling_utils_test.py

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 6da6a5e507..d88c57bb79 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
 
 
 class BertLMPredictionHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertLMPredictionHead, self).__init__()
         self.transform = BertPredictionHeadTransform(config)
-        self.torchscript = config.torchscript
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
-                                 bert_model_embedding_weights.size(0),
+        self.decoder = nn.Linear(config.hidden_size,
+                                 config.vocab_size,
                                  bias=False)
 
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
-        else:
-            self.decoder.weight = bert_model_embedding_weights
-
-        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
 
 
 class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertOnlyMLMHead, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
 
     def forward(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)
@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
 
 
 class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertPreTrainingHeads, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, sequence_output, pooled_output):
@@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel):
         super(BertForPreTraining, self).__init__(config)
 
         self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertPreTrainingHeads(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        if self.config.torchscript:
+            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
@@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel):
         super(BertForMaskedLM, self).__init__(config)
 
         self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertOnlyMLMHead(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        input_embeddings = self.bert.embeddings.word_embeddings.weight
+        if self.config.torchscript:
+            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 495e002529..06f933147f 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
 
     Args:
         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
         embd_pdrop: The dropout ratio for the embeddings.
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
-        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
         vocab_size_or_config_json_file=50257,
-        n_special=0,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True,
 
         num_labels=1,
         summary_type='token_ids',
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
 
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
             embd_pdrop: The dropout ratio for the embeddings.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
         super(GPT2Config, self).__init__(**kwargs)
 
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
             self.attn_pdrop = attn_pdrop
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
 
             self.num_labels = num_labels
             self.summary_type = summary_type
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd
@@ -347,34 +335,6 @@ class Block(nn.Module):
         return outputs  # x, present, (attentions)
 
 
-class GPT2LMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(GPT2LMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        # Export to TorchScript can't handle parameter sharing so we are cloning them.
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
 class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `gpt2`
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . a TensorFlow checkpoint with trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific GPT2 class
-        """
-        num_special_tokens = kwargs.pop('num_special_tokens', None)
-
-        model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 
 class GPT2Model(GPT2PreTrainedModel):
     """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
 
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
+    where total_tokens_embeddings is equal to
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = vocab_size + n_special
 
     You should use the associated indices to index the embeddings.
 
@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
 
-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
@@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrix if needed.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-
-        TODO Lysandre filled args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.wte
-        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.wte.to(old_embed.weight.device)
-        self.init_weights(self.wte)
-        # Copy word embeddings from the previous weights
-        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.wte.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """
@@ -740,25 +649,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.wte.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index aa35b163f1..ebf1035d21 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size_or_config_json_file=40478,
-        n_special=0,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd
@@ -355,34 +349,6 @@ class Block(nn.Module):
         return outputs
 
 
-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . a series of NumPy files containing OpenAI TensorFlow trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
-        """
-        num_special_tokens = kwargs.get('num_special_tokens', None)
-        kwargs.pop('num_special_tokens', None)
-
-        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
 
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings``  is:
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + n_special
 
     You should use the associated indices to index the embeddings.
 
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
 
         self.apply(self.init_weights)
 
-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrice if needed
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-
-        TODO Lysandre filled Args
-
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.weight.device)
-        self.init_weights(self.tokens_embed)
-        # Copy word embeddings from the previous weights
-        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled Args
-
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.tokens_embed.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """
@@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
+         config.vocab_size + n_special - 1]                  ______________________
 
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+    where ``total_tokens_embeddings`` is:
 
     ::
 
-        total_tokens_embeddings = config.vocab_size + config.n_special
+        total_tokens_embeddings = config.vocab_size + .n_special
 
     You should use the associate indices to index the embeddings.
 
@@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
 
         self.apply(self.init_weights)
+        self.tie_weights()
 
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-
-        TODO Lysandre filled Args
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        input_embeddings = self.transformer.tokens_embed.weight
+        if self.config.torchscript:
+            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
+        else:
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index b194f43b68..71f80a9eea 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def vocab_size(self):
+        return self.n_token
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError
+
     def backward_compatible(self):
         self.sample_softmax = -1
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index a9445ecad5..8fdfda4720 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
     pretrained_model_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
+    input_embeddings = None
 
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
@@ -164,12 +165,48 @@ class PreTrainedModel(nn.Module):
         # Save config in model
         self.config = config
 
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
+        # Build new embeddings
+        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
+        new_embeddings.to(old_embeddings.weight.device)
+
+        # initialize all new embeddings (in particular added tokens)
+        self.init_weights(new_embeddings)
+
+        # Copy word embeddings from the previous weights
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
+
+        return new_embeddings
+
+    def resize_token_embeddings(self, new_num_tokens):
+        """ Resize input token embeddings matrix.
+
+        Args:
+            new_num_tokens: New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+        """
+        if new_num_tokens == self.config.vocab_size:
+            return
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._resize_token_embeddings(new_num_tokens)
+
+        # Update base model and current model config
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+
+        # Tie weights again if needed
+        if hasattr(self, 'tie_weights'):
+            self.tie_weights()
+
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
-        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_to_prune._prune_heads(heads_to_prune)
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._prune_heads(heads_to_prune)
 
     def save_pretrained(self, save_directory):
         """ Save a model with its configuration file to a directory, so that it
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 7567a0f24b..3d5b35fae6 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
 
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
-                 n_special=0,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.n_words = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.emb_dim = emb_dim
             self.n_layers = n_layers
             self.n_heads = n_heads
@@ -183,8 +181,8 @@ class XLMConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")
 
     @property
-    def total_tokens_embeddings(self):
-        return self.n_words + self.n_special
+    def vocab_size(self):
+        return self.n_words
 
     @property
     def hidden_size(self):
@@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
@@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
-        self.torchscript = config.torchscript
-
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)
 
@@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
+        if self.config.torchscript:
             self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
         else:
             self.pred_layer.proj.weight = self.transformer.embeddings.weight
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index a5f95957c3..36c068e3a3 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -312,6 +312,10 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def vocab_size(self):
+        return self.n_token
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel):
 
         self.apply(self.init_weights)
 
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+
     def _prune_heads(self, heads_to_prune):
         logger.info("Head pruning is not implemented for XLNet")
         pass
@@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
         self.same_length = config.same_length
-        self.torchscript = config.torchscript
 
         self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
 
-        # Tie weights
-
         self.apply(self.init_weights)
         self.tie_weights()
 
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
+        if self.config.torchscript:
             self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
         else:
             self.lm_loss.weight = self.transformer.word_embedding.weight
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index fbdce29366..4ab0c9d157 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class BertModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_common_test.py
similarity index 91%
rename from pytorch_transformers/tests/modeling_tests_commons.py
rename to pytorch_transformers/tests/modeling_common_test.py
index 5535177aaa..98849216fa 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -22,8 +22,15 @@ import shutil
 import json
 import random
 
+import unittest
+import logging
+
 import torch
 
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
@@ -242,6 +249,7 @@ class ConfigTester(object):
 
     def create_and_test_config_common_properties(self):
         config = self.config_class(**self.inputs_dict)
+        self.parent.assertTrue(hasattr(config, 'vocab_size'))
         self.parent.assertTrue(hasattr(config, 'hidden_size'))
         self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
         self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
@@ -276,7 +284,6 @@ class GPTModelTester(object):
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
-                    n_special=1,
                     n_positions=33,
                     hidden_size=32,
                     num_hidden_layers=5,
@@ -299,7 +306,6 @@ class GPTModelTester(object):
         self.use_token_type_ids = use_token_type_ids
         self.use_labels = use_labels
         self.vocab_size = vocab_size
-        self.n_special = n_special
         self.n_positions = n_positions
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -316,7 +322,7 @@ class GPTModelTester(object):
         self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
 
     def prepare_config_and_inputs(self):
-        total_num_tokens = self.vocab_size + self.n_special
+        total_num_tokens = self.vocab_size
         input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
 
         position_ids = None
@@ -338,7 +344,6 @@ class GPTModelTester(object):
 
         config = self.config_class(
             vocab_size_or_config_json_file=self.vocab_size,
-            n_special=self.n_special,
             n_positions=self.n_positions,
             n_embd=self.hidden_size,
             n_layer=self.num_hidden_layers,
@@ -370,7 +375,7 @@ class GPTModelTester(object):
         outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
         loss, lm_logits = outputs[:2]
 
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
         self.parent.assertListEqual(
             list(lm_logits.size()),
             [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -400,7 +405,7 @@ class GPTModelTester(object):
         lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
         loss = [lm_loss, mc_loss]
 
-        total_voc = self.n_special + self.vocab_size
+        total_voc = self.vocab_size
         self.parent.assertListEqual(
             list(lm_logits.size()),
             [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -441,6 +446,30 @@ class GPTModelTester(object):
         self.create_and_check_commons(*config_and_inputs)
 
     def run_slow_tests(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_model_from_pretrained(*config_and_inputs)
+        self.create_and_check_model_from_pretrained()
 
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+            for value in loading_info.values():
+                self.assertEqual(len(value), 0)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
index 7400c9f64d..00a9cb4614 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class GPT2ModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
index 27263ecb24..4f57f4661b 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -24,7 +24,7 @@ import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class OpenAIModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index 49ba1addf1..9631cd6034 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
deleted file mode 100644
index 4944f41228..0000000000
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-import logging
-
-from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-class ModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-            for value in loading_info.values():
-                self.assertEqual(len(value), 0)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 6e2e082d19..9d6bc4054d 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,7 +23,7 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class XLMModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index e167e2d2e8..41c114ce9c 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
 
 class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):

From e3fb4310d6ce63ea55e814544fa47207bc3f72f9 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 18:44:29 -0400
Subject: [PATCH 105/139] From pretrained correct initialization. Unknown token
 handling for gpt2.

---
 pytorch_transformers/modeling_gpt2.py     | 2 +-
 pytorch_transformers/modeling_openai.py   | 2 +-
 pytorch_transformers/tokenization_gpt2.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 495e002529..2b8ec88a50 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -423,7 +423,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         """
         num_special_tokens = kwargs.pop('num_special_tokens', None)
 
-        model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        model = super(GPT2PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index aa35b163f1..de7ab8ae4b 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -431,7 +431,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
         num_special_tokens = kwargs.get('num_special_tokens', None)
         kwargs.pop('num_special_tokens', None)
 
-        model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
+        model = super(OpenAIGPTPreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
 
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index af1ad2cf8f..6084dc3e05 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -177,11 +177,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
+        return self.encoder.get(token)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
+        return self.decoder.get(index)
 
     def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""

From 3821ecbf4ac442cbaad7a1fc0d8c20136bbfe32a Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 20:16:28 -0400
Subject: [PATCH 106/139] Byte order mark management in TSV glue reading.

---
 examples/utils_glue.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 5ad36abf10..bba9a901a8 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -78,7 +78,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
+        with open(input_file, "r", encoding="utf-8-sig") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:

From 6c2ee16c0418a09c13cd59bf285f56feb001d3b5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 22:09:16 -0400
Subject: [PATCH 107/139] Test suite testing the tie_weights function as well
 as the resize_token_embeddings function. Patched an issue relating to the
 tied weights I had introduced with the TorchScript addition. Byte order mark
 management in TSV glue reading.

---
 examples/utils_glue.py                        |  2 +-
 pytorch_transformers/modeling_bert.py         |  4 +-
 pytorch_transformers/modeling_gpt2.py         |  4 +-
 pytorch_transformers/modeling_openai.py       |  4 +-
 .../tests/modeling_common_test.py             | 74 +++++++++++++++++++
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 5ad36abf10..bba9a901a8 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -78,7 +78,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
+        with open(input_file, "r", encoding="utf-8-sig") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index d88c57bb79..23b2e76ec7 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -762,7 +762,7 @@ class BertForPreTraining(BertPreTrainedModel):
         if self.config.torchscript:
             self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
+            self.cls.predictions.decoder = self.bert.embeddings.word_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
@@ -868,7 +868,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         if self.config.torchscript:
             self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
+            self.cls.predictions.decoder = self.bert.embeddings.word_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 06f933147f..5823bad322 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -566,7 +566,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+            self.lm_head = self.transformer.wte  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """
@@ -662,7 +662,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+            self.lm_head = self.transformer.wte  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index ebf1035d21..47a07e77b3 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -587,7 +587,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+            self.lm_head = self.transformer.tokens_embed  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """
@@ -700,7 +700,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+            self.lm_head = self.transformer.tokens_embed  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py
index 98849216fa..9f14c181bb 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -29,6 +29,7 @@ import torch
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
 from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_gpt2 import GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 def _config_zero_init(config):
@@ -470,6 +471,79 @@ class ModelUtilsTest(unittest.TestCase):
             self.assertEqual(model.config.output_hidden_states, True)
             self.assertEqual(model.config, config)
 
+    def test_resize_tokens_embeddings(self):
+        logging.basicConfig(level=logging.INFO)
+
+
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            model = BertModel.from_pretrained(model_name)
+
+            model_vocab_size = config.vocab_size
+            # Retrieve the embeddings and clone theme
+            cloned_embeddings = model.embeddings.word_embeddings.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0] + 10)
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size)
+            self.assertEqual(model.config.vocab_size, model_vocab_size)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0])
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model.embeddings.word_embeddings.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+    def test_tie_model_weights(self):
+        logging.basicConfig(level=logging.INFO)
+
+        def check_same_values(layer_1, layer_2):
+            equal = True
+            for p1, p2 in zip(layer_1.weight, layer_2.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    equal = False
+            return equal
+
+        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = GPT2Config.from_pretrained(model_name)
+            model = GPT2LMHeadModel.from_pretrained(model_name)
+
+            # Get the embeddings and decoding layer
+            embeddings = model.transformer.wte
+            decoding = model.lm_head
+
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            self.assertTrue(check_same_values(embeddings, decoding))
+
+            # Check that after modification, they remain the same.
+            embeddings.weight.data.div_(2)
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            self.assertTrue(check_same_values(embeddings, decoding))
+
+            # Check that after modification, they remain the same.
+            decoding.weight.data.div_(4)
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            self.assertTrue(check_same_values(embeddings, decoding))
+
+            # Check that after resize they remain tied.
+            model.resize_token_embeddings(config.vocab_size + 10)
+            decoding.weight.data.mul_(20)
+            # Check that the embedding layer and decoding layer are the same in size and in value
+            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+            self.assertTrue(check_same_values(embeddings, decoding))
+
 
 if __name__ == "__main__":
     unittest.main()

From 3fbceed8d2a53f89c2e8556e97400192f642a0e5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 11 Jul 2019 22:29:55 -0400
Subject: [PATCH 108/139] Fix layer reference loss + previous attempted fix

---
 pytorch_transformers/modeling_bert.py              | 4 ++--
 pytorch_transformers/modeling_gpt2.py              | 4 ++--
 pytorch_transformers/modeling_openai.py            | 4 ++--
 pytorch_transformers/tests/modeling_common_test.py | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 23b2e76ec7..d88c57bb79 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -762,7 +762,7 @@ class BertForPreTraining(BertPreTrainedModel):
         if self.config.torchscript:
             self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.cls.predictions.decoder = self.bert.embeddings.word_embeddings  # Tied weights
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
@@ -868,7 +868,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         if self.config.torchscript:
             self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.cls.predictions.decoder = self.bert.embeddings.word_embeddings  # Tied weights
+            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 5823bad322..06f933147f 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -566,7 +566,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head = self.transformer.wte  # Tied weights
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """
@@ -662,7 +662,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head = self.transformer.wte  # Tied weights
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 47a07e77b3..ebf1035d21 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -587,7 +587,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head = self.transformer.tokens_embed  # Tied weights
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """
@@ -700,7 +700,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         if self.config.torchscript:
             self.lm_head.weight = nn.Parameter(input_embeddings.clone())
         else:
-            self.lm_head = self.transformer.tokens_embed  # Tied weights
+            self.lm_head.weight = input_embeddings  # Tied weights
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py
index 9f14c181bb..557d8aede0 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -541,8 +541,8 @@ class ModelUtilsTest(unittest.TestCase):
             model.resize_token_embeddings(config.vocab_size + 10)
             decoding.weight.data.mul_(20)
             # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            self.assertTrue(check_same_values(embeddings, decoding))
+            self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
+            self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
 
 if __name__ == "__main__":

From 2918b7d2a09d7253b338c004258866da41cd6642 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 10:57:58 +0200
Subject: [PATCH 109/139] updating tests

---
 pytorch_transformers/modeling_bert.py         |  15 +-
 pytorch_transformers/modeling_gpt2.py         |  15 +-
 pytorch_transformers/modeling_openai.py       |  15 +-
 pytorch_transformers/modeling_transfo_xl.py   |  19 +-
 pytorch_transformers/modeling_utils.py        |  47 +-
 pytorch_transformers/modeling_xlm.py          |  10 +-
 pytorch_transformers/modeling_xlnet.py        |  13 +-
 .../tests/modeling_bert_test.py               |  88 +-
 .../tests/modeling_common_test.py             | 848 +++++++++---------
 .../tests/modeling_gpt2_test.py               |  13 +-
 .../tests/modeling_openai_test.py             |   7 +-
 .../tests/modeling_transfo_xl_test.py         |  56 +-
 .../tests/modeling_xlm_test.py                |  51 +-
 .../tests/modeling_xlnet_test.py              |  71 +-
 14 files changed, 672 insertions(+), 596 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index d88c57bb79..8c75925a07 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -617,6 +617,7 @@ class BertModel(BertPreTrainedModel):
         old_embeddings = self.embeddings.word_embeddings
         new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
         self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -758,11 +759,8 @@ class BertForPreTraining(BertPreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
-        if self.config.torchscript:
-            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
@@ -864,11 +862,8 @@ class BertForMaskedLM(BertPreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.bert.embeddings.word_embeddings.weight
-        if self.config.torchscript:
-            self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.cls.predictions.decoder.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 06f933147f..b5fc6fc49b 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -414,6 +414,7 @@ class GPT2Model(GPT2PreTrainedModel):
 
     def _resize_token_embeddings(self, new_num_tokens):
         self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
+        return self.wte
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -562,11 +563,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.transformer.wte.weight
-        if self.config.torchscript:
-            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """
@@ -658,11 +656,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.transformer.wte.weight
-        if self.config.torchscript:
-            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index ebf1035d21..9fb4720e93 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -430,6 +430,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     def _resize_token_embeddings(self, new_num_tokens):
         self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
+        return self.tokens_embed
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -583,11 +584,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.transformer.tokens_embed.weight
-        if self.config.torchscript:
-            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """
@@ -696,11 +694,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         """ Make sure we are sharing the input and output embeddings.
             Export to TorchScript can't handle parameter sharing so we are cloning them instead.
         """
-        input_embeddings = self.transformer.tokens_embed.weight
-        if self.config.torchscript:
-            self.lm_head.weight = nn.Parameter(input_embeddings.clone())
-        else:
-            self.lm_head.weight = input_embeddings  # Tied weights
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 71f80a9eea..b31723168a 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -291,6 +291,10 @@ class TransfoXLConfig(PretrainedConfig):
     def vocab_size(self):
         return self.n_token
 
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -1003,7 +1007,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.apply(self.init_weights)
 
     def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
+        return self.word_emb
 
     def backward_compatible(self):
         self.sample_softmax = -1
@@ -1280,13 +1284,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         else:
             if self.config.tie_weight:
                 for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+                    self._tie_or_clone_weights(self.crit.out_layers[i],
+                                               self.transformer.word_emb.emb_layers[i])
             if self.config.tie_projs:
                 for i, tie_proj in enumerate(self.config.tie_projs):
                     if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
                     elif tie_proj and self.config.div_val != 1:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
 
     def reset_length(self, tgt_len, ext_len, mem_len):
         self.transformer.reset_length(tgt_len, ext_len, mem_len)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 8fdfda4720..9ca3a3d090 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -165,9 +165,27 @@ class PreTrainedModel(nn.Module):
         # Save config in model
         self.config = config
 
-    def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
-        # Build new embeddings
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
+        """ Build a resized Embedding Module from a provided token Embedding Module.
+            Increasing the size will add newly initialized vectors at the end
+            Reducing the size will remove vectors from the end
+
+        Args:
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: return the provided token Embedding Module.
+        Return:
+            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
+        """
+        if new_num_tokens is None:
+            return old_embeddings
+
         old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        if old_num_tokens == new_num_tokens:
+            return old_embeddings
+
+        # Build new embeddings
         new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
         new_embeddings.to(old_embeddings.weight.device)
 
@@ -180,18 +198,29 @@ class PreTrainedModel(nn.Module):
 
         return new_embeddings
 
-    def resize_token_embeddings(self, new_num_tokens):
-        """ Resize input token embeddings matrix.
+    def _tie_or_clone_weights(self, first_module, second_module):
+        """ Tie or clone module weights depending of weither we are using TorchScript or not
+        """
+        if self.config.torchscript:
+            first_module.weight = nn.Parameter(second_module.weight.clone())
+        else:
+            first_module.weight = second_module.weight
+
+    def resize_token_embeddings(self, new_num_tokens=None):
+        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
 
         Args:
-            new_num_tokens: New number of tokens in the embedding matrix.
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
                 Increasing the size will add newly initialized vectors at the end
                 Reducing the size will remove vectors from the end
+                If not provided or None: does nothing.
+        Return:
+            Pointer to the input tokens Embedding Module of the model
         """
-        if new_num_tokens == self.config.vocab_size:
-            return
         base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        base_model._resize_token_embeddings(new_num_tokens)
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
+        if new_num_tokens is None:
+            return model_embeds
 
         # Update base model and current model config
         self.config.vocab_size = new_num_tokens
@@ -201,6 +230,8 @@ class PreTrainedModel(nn.Module):
         if hasattr(self, 'tie_weights'):
             self.tie_weights()
 
+        return model_embeds
+
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 3d5b35fae6..755e504b7d 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -184,6 +184,10 @@ class XLMConfig(PretrainedConfig):
     def vocab_size(self):
         return self.n_words
 
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_words = value
+
     @property
     def hidden_size(self):
         return self.emb_dim
@@ -479,6 +483,7 @@ class XLMModel(XLMPreTrainedModel):
 
     def _resize_token_embeddings(self, new_num_tokens):
         self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+        return self.embeddings
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -728,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.config.torchscript:
-            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
-        else:
-            self.pred_layer.proj.weight = self.transformer.embeddings.weight
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
 
     def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 36c068e3a3..051cc4e112 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -316,6 +316,10 @@ class XLNetConfig(PretrainedConfig):
     def vocab_size(self):
         return self.n_token
 
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model
@@ -660,10 +664,10 @@ class XLNetModel(XLNetPreTrainedModel):
 
     def _resize_token_embeddings(self, new_num_tokens):
         self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+        return self.word_embedding
 
     def _prune_heads(self, heads_to_prune):
-        logger.info("Head pruning is not implemented for XLNet")
-        pass
+        raise NotImplementedError
 
     def create_mask(self, qlen, mlen):
         """
@@ -987,10 +991,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.config.torchscript:
-            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
-        else:
-            self.lm_loss.weight = self.transformer.word_embedding.weight
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index 4ab0c9d157..ac5d2636a9 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
 
 
-class BertModelTest(unittest.TestCase):
+class BertModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
+            BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
+            BertForTokenClassification)
+
     class BertModelTester(object):
 
         def __init__(self,
@@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                             BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                             BertForTokenClassification),
                     ):
             self.parent = parent
             self.batch_size = batch_size
@@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -253,16 +254,51 @@ class BertModelTest(unittest.TestCase):
             self.check_loss_output(result)
 
 
-        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
 
-    def test_default(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
+    def setUp(self):
+        self.model_tester = BertModelTest.BertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
 
     def test_config(self):
-        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()
+
+    def test_bert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
+
+    def test_for_next_sequence_prediction(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
+
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
@@ -272,33 +308,5 @@ class BertModelTest(unittest.TestCase):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
 
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_model(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_pretraining(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_question_answering(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_token_classification(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_commons(*config_and_inputs)
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py
index 557d8aede0..5ea98d68e2 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -39,207 +39,471 @@ def _config_zero_init(config):
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
-def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
-    config.output_attentions = True
-    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+class CommonTestCases:
 
-def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
-    config.output_hidden_states = True
-    _create_and_check_torchscript(tester, model_classes, config, inputs_dict)
+    class CommonModelTester(unittest.TestCase):
 
-def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
-    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-    configs_no_init.torchscript = True
-    for model_class in model_classes:
-        model = model_class(config=configs_no_init)
-        model.eval()
-        inputs = inputs_dict['input_ids']  # Let's keep only input_ids
+        model_tester = None
+        all_model_classes = ()
+        test_torchscript = True
+        test_pruning = True
+        test_resize_embeddings = True
 
-        try:
-            torch.jit.trace(model, inputs)
-        except RuntimeError:
-            tester.parent.fail("Couldn't trace module.")
+        def test_initialization(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        try:
-            traced_gpt2 = torch.jit.trace(model, inputs)
-            torch.jit.save(traced_gpt2, "traced_model.pt")
-        except RuntimeError:
-            tester.parent.fail("Couldn't save module.")
+            configs_no_init = _config_zero_init(config)
+            for model_class in self.all_model_classes:
+                model = model_class(config=configs_no_init)
+                for name, param in model.named_parameters():
+                    if param.requires_grad:
+                        self.assertIn(param.data.mean().item(), [0.0, 1.0],
+                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
 
-        try:
-            loaded_model = torch.jit.load("traced_model.pt")
-            os.remove("traced_model.pt")
-        except ValueError:
-            tester.parent.fail("Couldn't load module.")
+        def test_attention_outputs(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        model.eval()
-        loaded_model.eval()
+            for model_class in self.all_model_classes:
+                config.output_attentions = True
+                config.output_hidden_states = False
+                model = model_class(config)
+                model.eval()
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, False)
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads,
+                    self.model_tester.seq_length,
+                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                out_len = len(outputs)
 
-        model_params = model.parameters()
-        loaded_model_params = loaded_model.parameters()
+                # Check attention is always last and order is fine
+                config.output_attentions = True
+                config.output_hidden_states = True
+                model = model_class(config)
+                model.eval()
+                outputs = model(**inputs_dict)
+                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, True)
 
-        models_equal = True
-        for p1, p2 in zip(model_params, loaded_model_params):
-            if p1.data.ne(p2.data).sum() > 0:
-                models_equal = False
+                attentions = outputs[-1]
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads,
+                    self.model_tester.seq_length,
+                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
 
-        tester.parent.assertTrue(models_equal)
+        def test_torchscript(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
-    configs_no_init = _config_zero_init(config)
-    for model_class in model_classes:
-        model = model_class(config=configs_no_init)
-        for name, param in model.named_parameters():
-            if param.requires_grad:
-                tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
-                                       msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+            self._create_and_check_torchscript(config, inputs_dict)
 
-def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
-    configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
-    for model_class in model_classes:
-        config.output_attentions = True
-        config.output_hidden_states = True
-        model = model_class(config=configs_no_init)
-        model.eval()
+        def test_torchscript_output_attentions(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        # Prepare head_mask
-        # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
-        head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
-        head_mask[0, 0] = 0
-        head_mask[-1, :-1] = 0
-        head_mask.requires_grad_(requires_grad=True)
-        inputs = inputs_dict.copy()
-        inputs['head_mask'] = head_mask
+            config.output_attentions = True
+            self._create_and_check_torchscript(config, inputs_dict)
 
-        outputs = model(**inputs)
+        def test_torchscript_output_hidden_state(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        # Test that we can get a gradient back for importance score computation
-        output = sum(t.sum() for t in outputs[0])
-        output = output.sum()
-        output.backward()
-        multihead_outputs = head_mask.grad
+            config.output_hidden_states = True
+            self._create_and_check_torchscript(config, inputs_dict)
 
-        attentions = outputs[-1]
-        hidden_states = outputs[-2]
+        def _create_and_check_torchscript(self, config, inputs_dict):
+            if not self.test_torchscript:
+                return
 
-        # Remove Nan
+            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+            configs_no_init.torchscript = True
+            for model_class in self.all_model_classes:
+                model = model_class(config=configs_no_init)
+                model.eval()
+                inputs = inputs_dict['input_ids']  # Let's keep only input_ids
 
-        tester.parent.assertIsNotNone(multihead_outputs)
-        tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
-        tester.parent.assertAlmostEqual(
-            attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
-            attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
-            attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertAlmostEqual(
-            attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
-        tester.parent.assertNotEqual(
-            attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+                try:
+                    torch.jit.trace(model, inputs)
+                except RuntimeError:
+                    self.fail("Couldn't trace module.")
+
+                try:
+                    traced_gpt2 = torch.jit.trace(model, inputs)
+                    torch.jit.save(traced_gpt2, "traced_model.pt")
+                except RuntimeError:
+                    self.fail("Couldn't save module.")
+
+                try:
+                    loaded_model = torch.jit.load("traced_model.pt")
+                    os.remove("traced_model.pt")
+                except ValueError:
+                    self.fail("Couldn't load module.")
+
+                model.eval()
+                loaded_model.eval()
+
+                model_params = model.parameters()
+                loaded_model_params = loaded_model.parameters()
+
+                models_equal = True
+                for p1, p2 in zip(model_params, loaded_model_params):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+
+                self.assertTrue(models_equal)
 
 
-def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
-    for model_class in model_classes:
-        config.output_attentions = True
-        config.output_hidden_states = False
-        model = model_class(config=config)
-        model.eval()
-        heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
-                          -1: [0]}
-        model.prune_heads(heads_to_prune)
-        outputs = model(**inputs_dict)
+        def test_headmasking(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        attentions = outputs[-1]
+            config.output_attentions = True
+            config.output_hidden_states = True
+            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+            for model_class in self.all_model_classes:
+                model = model_class(config=configs_no_init)
+                model.eval()
 
-        tester.parent.assertEqual(
-            attentions[0].shape[-3], 1)
-        tester.parent.assertEqual(
-            attentions[1].shape[-3], tester.num_attention_heads)
-        tester.parent.assertEqual(
-            attentions[-1].shape[-3], tester.num_attention_heads - 1)
+                # Prepare head_mask
+                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) 
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask[0, 0] = 0
+                head_mask[-1, :-1] = 0
+                head_mask.requires_grad_(requires_grad=True)
+                inputs = inputs_dict.copy()
+                inputs['head_mask'] = head_mask
+
+                outputs = model(**inputs)
+
+                # Test that we can get a gradient back for importance score computation
+                output = sum(t.sum() for t in outputs[0])
+                output = output.sum()
+                output.backward()
+                multihead_outputs = head_mask.grad
+
+                attentions = outputs[-1]
+                hidden_states = outputs[-2]
+
+                # Remove Nan
+
+                self.assertIsNotNone(multihead_outputs)
+                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
+                self.assertAlmostEqual(
+                    attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(
+                    attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(
+                    attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+                self.assertAlmostEqual(
+                    attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+                self.assertNotEqual(
+                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
 
-def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
-    for model_class in model_classes:
-        config.output_attentions = True
-        config.output_hidden_states = False
-        model = model_class(config)
-        model.eval()
-        outputs = model(**inputs_dict)
-        attentions = outputs[-1]
-        tester.parent.assertEqual(model.config.output_attentions, True)
-        tester.parent.assertEqual(model.config.output_hidden_states, False)
-        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
-        tester.parent.assertListEqual(
-            list(attentions[0].shape[-3:]),
-            [tester.num_attention_heads,
-             tester.seq_length,
-             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
-        out_len = len(outputs)
+        def test_head_pruning(self):
+            if not self.test_pruning:
+                return
 
-        # Check attention is always last and order is fine
-        config.output_attentions = True
-        config.output_hidden_states = True
-        model = model_class(config)
-        model.eval()
-        outputs = model(**inputs_dict)
-        tester.parent.assertEqual(out_len+1, len(outputs))
-        tester.parent.assertEqual(model.config.output_attentions, True)
-        tester.parent.assertEqual(model.config.output_hidden_states, True)
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-        attentions = outputs[-1]
-        tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
-        tester.parent.assertListEqual(
-            list(attentions[0].shape[-3:]),
-            [tester.num_attention_heads,
-             tester.seq_length,
-             tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
+            for model_class in self.all_model_classes:
+                config.output_attentions = True
+                config.output_hidden_states = False
+                model = model_class(config=config)
+                model.eval()
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
+                                -1: [0]}
+                model.prune_heads(heads_to_prune)
+                outputs = model(**inputs_dict)
 
-def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
-    for model_class in model_classes:
-        config.output_hidden_states = True
-        config.output_attentions = False
-        model = model_class(config)
-        model.eval()
-        outputs = model(**inputs_dict)
-        hidden_states = outputs[-1]
-        tester.parent.assertEqual(model.config.output_attentions, False)
-        tester.parent.assertEqual(model.config.output_hidden_states, True)
-        tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
-        tester.parent.assertListEqual(
-            list(hidden_states[0].shape[-2:]),
-            [tester.seq_length, tester.hidden_size])
+                attentions = outputs[-1]
+
+                self.assertEqual(
+                    attentions[0].shape[-3], 1)
+                self.assertEqual(
+                    attentions[1].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(
+                    attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
 
-def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
-    _create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
-    _create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
+        def test_hidden_states_output(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-    if test_torchscript:
-        _create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
-        _create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
-        _create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
+            for model_class in self.all_model_classes:
+                config.output_hidden_states = True
+                config.output_attentions = False
+                model = model_class(config)
+                model.eval()
+                outputs = model(**inputs_dict)
+                hidden_states = outputs[-1]
+                self.assertEqual(model.config.output_attentions, False)
+                self.assertEqual(model.config.output_hidden_states, True)
+                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [self.model_tester.seq_length, self.model_tester.hidden_size])
 
-    if test_pruning:
-        _create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
+        def test_resize_tokens_embeddings(self):
+            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            if not self.test_resize_embeddings:
+                return
+
+            for model_class in self.all_model_classes:
+                config = copy.deepcopy(original_config)
+                model = model_class(config)
+
+                model_vocab_size = config.vocab_size
+                # Retrieve the embeddings and clone theme
+                model_embed = model.resize_token_embeddings(model_vocab_size)
+                cloned_embeddings = model_embed.weight.clone()
+
+                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+                model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+                # Check that it actually resizes the embeddings matrix
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+
+                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+                model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+                self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+                # Check that it actually resizes the embeddings matrix
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+                models_equal = True
+                for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+
+                self.assertTrue(models_equal)
+
+        def test_tie_model_weights(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            def check_same_values(layer_1, layer_2):
+                equal = True
+                for p1, p2 in zip(layer_1.weight, layer_2.weight):
+                    if p1.data.ne(p2.data).sum() > 0:
+                        equal = False
+                return equal
+
+            for model_class in self.all_model_classes:
+                if not hasattr(model_class, 'tie_weights'):
+                    continue
+
+                config.torchscript = True
+                model_not_tied = model_class(config)
+                params_not_tied = list(model_not_tied.parameters())
+
+                config_tied = copy.deepcopy(config)
+                config_tied.torchscript = False
+                model_tied = model_class(config_tied)
+                params_tied = list(model_tied.parameters())
+
+                # Check that the embedding layer and decoding layer are the same in size and in value
+                self.assertGreater(len(params_not_tied), len(params_tied))
+                # self.assertTrue(check_same_values(embeddings, decoding))
+
+                # # Check that after modification, they remain the same.
+                # embeddings.weight.data.div_(2)
+                # # Check that the embedding layer and decoding layer are the same in size and in value
+                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+                # self.assertTrue(check_same_values(embeddings, decoding))
+
+                # # Check that after modification, they remain the same.
+                # decoding.weight.data.div_(4)
+                # # Check that the embedding layer and decoding layer are the same in size and in value
+                # self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
+                # self.assertTrue(check_same_values(embeddings, decoding))
+
+                # Check that after resize they remain tied.
+                model_tied.resize_token_embeddings(config.vocab_size + 10)
+                params_tied_2 = list(model_tied.parameters())
+                self.assertGreater(len(params_not_tied), len(params_tied))
+                self.assertEqual(len(params_tied_2), len(params_tied))
+
+                # decoding.weight.data.mul_(20)
+                # # Check that the embedding layer and decoding layer are the same in size and in value
+                # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
+                # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
 
-def ids_tensor(shape, vocab_size, rng=None, name=None):
-    """Creates a random int32 tensor of the shape within the vocab size."""
-    if rng is None:
-        rng = random.Random()
+    class GPTModelTester(CommonModelTester):
 
-    total_dims = 1
-    for dim in shape:
-        total_dims *= dim
+        def __init__(self,
+                        parent,
+                        batch_size=13,
+                        seq_length=7,
+                        is_training=True,
+                        use_position_ids=True,
+                        use_token_type_ids=True,
+                        use_labels=True,
+                        vocab_size=99,
+                        n_positions=33,
+                        hidden_size=32,
+                        num_hidden_layers=5,
+                        num_attention_heads=4,
+                        n_choices=3,
+                        type_sequence_label_size=2,
+                        initializer_range=0.02,
+                        num_labels=3,
+                        scope=None,
+                        config_class=None,
+                        base_model_class=None,
+                        lm_head_model_class=None,
+                        double_head_model_class=None,
+                        ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_position_ids = use_position_ids
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.n_choices = n_choices
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.scope = scope
+            self.config_class = config_class
+            self.base_model_class = base_model_class
+            self.lm_head_model_class = lm_head_model_class
+            self.double_head_model_class = double_head_model_class
+            self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
 
-    values = []
-    for _ in range(total_dims):
-        values.append(rng.randint(0, vocab_size - 1))
+        def prepare_config_and_inputs(self):
+            total_num_tokens = self.vocab_size
+            input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
 
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+            position_ids = None
+            if self.use_position_ids:
+                position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                total_voc = self.vocab_size
+                token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
+
+            mc_labels = None
+            lm_labels = None
+            mc_token_ids = None
+            if self.use_labels:
+                mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
+                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
+
+            config = self.config_class(
+                vocab_size_or_config_json_file=self.vocab_size,
+                n_positions=self.n_positions,
+                n_embd=self.hidden_size,
+                n_layer=self.num_hidden_layers,
+                n_head=self.num_attention_heads,
+                initializer_range=self.initializer_range)
+
+            return (config, input_ids, token_type_ids, position_ids,
+                    mc_labels, lm_labels, mc_token_ids)
+
+        def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
+                                mc_labels, lm_labels, mc_token_ids):
+            model = self.base_model_class(config)
+            model.eval()
+
+            outputs = model(input_ids, position_ids, token_type_ids)
+            outputs = model(input_ids, position_ids)
+            outputs = model(input_ids)
+
+            hidden_state = outputs[0]
+            self.parent.assertListEqual(
+                list(hidden_state.size()),
+                [self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
+
+
+        def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
+                                        mc_labels, lm_labels, mc_token_ids):
+            model = self.lm_head_model_class(config)
+            model.eval()
+            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            loss, lm_logits = outputs[:2]
+
+            total_voc = self.vocab_size
+            self.parent.assertListEqual(
+                list(lm_logits.size()),
+                [self.batch_size, self.n_choices, self.seq_length, total_voc])
+            self.parent.assertListEqual(
+                list(loss.size()),
+                [])
+
+        def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
+                                        mc_labels, lm_labels, mc_token_ids):
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.eval()
+                outputs = model(input_ids)
+                presents = outputs[-1]
+                self.parent.assertEqual(self.num_hidden_layers, len(presents))
+                self.parent.assertListEqual(
+                    list(presents[0].size()),
+                    [2, self.batch_size * self.n_choices, self.num_attention_heads,
+                        self.seq_length, self.hidden_size // self.num_attention_heads])
+
+        def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
+                                        mc_labels, lm_labels, mc_token_ids):
+            model = self.double_head_model_class(config)
+            model.eval()
+            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+                            token_type_ids=token_type_ids, position_ids=position_ids)
+            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
+            loss = [lm_loss, mc_loss]
+
+            total_voc = self.vocab_size
+            self.parent.assertListEqual(
+                list(lm_logits.size()),
+                [self.batch_size, self.n_choices, self.seq_length, total_voc])
+            self.parent.assertListEqual(
+                list(mc_logits.size()),
+                [self.batch_size, self.n_choices])
+            self.parent.assertListEqual(
+                [list(l.size()) for l in loss],
+                [[], []])
+
+        def create_and_check_model_from_pretrained(self):
+            cache_dir = "/tmp/pytorch_transformers_test/"
+            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
+                model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
+                shutil.rmtree(cache_dir)
+                self.parent.assertIsNotNone(model)
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, position_ids,
+                mc_labels, lm_labels, mc_token_ids) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids}
+            return config, inputs_dict
+
+        def run_common_tests(self, test_presents=False):
+            config_and_inputs = self.prepare_config_and_inputs()
+            self.create_and_check_base_model(*config_and_inputs)
+
+            config_and_inputs = self.prepare_config_and_inputs()
+            self.create_and_check_lm_head(*config_and_inputs)
+
+            config_and_inputs = self.prepare_config_and_inputs()
+            self.create_and_check_double_heads(*config_and_inputs)
+
+            if test_presents:
+                config_and_inputs = self.prepare_config_and_inputs()
+                self.create_and_check_presents(*config_and_inputs)
+
+        def run_slow_tests(self):
+            self.create_and_check_model_from_pretrained()
 
 
 class ConfigTester(object):
@@ -275,179 +539,22 @@ class ConfigTester(object):
         self.create_and_test_config_to_json_file()
 
 
-class GPTModelTester(object):
-    def __init__(self,
-                    parent,
-                    batch_size=13,
-                    seq_length=7,
-                    is_training=True,
-                    use_position_ids=True,
-                    use_token_type_ids=True,
-                    use_labels=True,
-                    vocab_size=99,
-                    n_positions=33,
-                    hidden_size=32,
-                    num_hidden_layers=5,
-                    num_attention_heads=4,
-                    n_choices=3,
-                    type_sequence_label_size=2,
-                    initializer_range=0.02,
-                    num_labels=3,
-                    scope=None,
-                    config_class=None,
-                    base_model_class=None,
-                    lm_head_model_class=None,
-                    double_head_model_class=None,
-                    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.is_training = is_training
-        self.use_position_ids = use_position_ids
-        self.use_token_type_ids = use_token_type_ids
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.n_positions = n_positions
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_choices = n_choices
-        self.type_sequence_label_size = type_sequence_label_size
-        self.initializer_range = initializer_range
-        self.num_labels = num_labels
-        self.scope = scope
-        self.config_class = config_class
-        self.base_model_class = base_model_class
-        self.lm_head_model_class = lm_head_model_class
-        self.double_head_model_class = double_head_model_class
-        self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
-
-    def prepare_config_and_inputs(self):
-        total_num_tokens = self.vocab_size
-        input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
-
-        position_ids = None
-        if self.use_position_ids:
-            position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
-
-        token_type_ids = None
-        if self.use_token_type_ids:
-            total_voc = self.vocab_size
-            token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
-
-        mc_labels = None
-        lm_labels = None
-        mc_token_ids = None
-        if self.use_labels:
-            mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-            lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-            mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
-
-        config = self.config_class(
-            vocab_size_or_config_json_file=self.vocab_size,
-            n_positions=self.n_positions,
-            n_embd=self.hidden_size,
-            n_layer=self.num_hidden_layers,
-            n_head=self.num_attention_heads,
-            initializer_range=self.initializer_range)
-
-        return (config, input_ids, token_type_ids, position_ids,
-                mc_labels, lm_labels, mc_token_ids)
-
-    def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
-                            mc_labels, lm_labels, mc_token_ids):
-        model = self.base_model_class(config)
-        model.eval()
-
-        outputs = model(input_ids, position_ids, token_type_ids)
-        outputs = model(input_ids, position_ids)
-        outputs = model(input_ids)
-
-        hidden_state = outputs[0]
-        self.parent.assertListEqual(
-            list(hidden_state.size()),
-            [self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
 
 
-    def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                    mc_labels, lm_labels, mc_token_ids):
-        model = self.lm_head_model_class(config)
-        model.eval()
-        outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
-        loss, lm_logits = outputs[:2]
+def ids_tensor(shape, vocab_size, rng=None, name=None):
+    """Creates a random int32 tensor of the shape within the vocab size."""
+    if rng is None:
+        rng = random.Random()
 
-        total_voc = self.vocab_size
-        self.parent.assertListEqual(
-            list(lm_logits.size()),
-            [self.batch_size, self.n_choices, self.seq_length, total_voc])
-        self.parent.assertListEqual(
-            list(loss.size()),
-            [])
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
 
-    def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
-                                    mc_labels, lm_labels, mc_token_ids):
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.eval()
-            outputs = model(input_ids)
-            presents = outputs[-1]
-            self.parent.assertEqual(self.num_hidden_layers, len(presents))
-            self.parent.assertListEqual(
-                list(presents[0].size()),
-                [2, self.batch_size * self.n_choices, self.num_attention_heads,
-                    self.seq_length, self.hidden_size // self.num_attention_heads])
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.randint(0, vocab_size - 1))
 
-    def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                    mc_labels, lm_labels, mc_token_ids):
-        model = self.double_head_model_class(config)
-        model.eval()
-        outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                        token_type_ids=token_type_ids, position_ids=position_ids)
-        lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
-        loss = [lm_loss, mc_loss]
-
-        total_voc = self.vocab_size
-        self.parent.assertListEqual(
-            list(lm_logits.size()),
-            [self.batch_size, self.n_choices, self.seq_length, total_voc])
-        self.parent.assertListEqual(
-            list(mc_logits.size()),
-            [self.batch_size, self.n_choices])
-        self.parent.assertListEqual(
-            [list(l.size()) for l in loss],
-            [[], []])
-
-    def create_and_check_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
-            model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.parent.assertIsNotNone(model)
-
-    def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
-                                    mc_labels, lm_labels, mc_token_ids):
-        inputs_dict = {'input_ids': input_ids}
-        create_and_check_commons(self, config, inputs_dict)
-
-    def run_common_tests(self, test_presents=False):
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_base_model(*config_and_inputs)
-
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_lm_head(*config_and_inputs)
-
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_double_heads(*config_and_inputs)
-
-        if test_presents:
-            config_and_inputs = self.prepare_config_and_inputs()
-            self.create_and_check_presents(*config_and_inputs)
-
-        config_and_inputs = self.prepare_config_and_inputs()
-        self.create_and_check_commons(*config_and_inputs)
-
-    def run_slow_tests(self):
-        self.create_and_check_model_from_pretrained()
+    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
 
 
 class ModelUtilsTest(unittest.TestCase):
@@ -471,79 +578,6 @@ class ModelUtilsTest(unittest.TestCase):
             self.assertEqual(model.config.output_hidden_states, True)
             self.assertEqual(model.config, config)
 
-    def test_resize_tokens_embeddings(self):
-        logging.basicConfig(level=logging.INFO)
-
-
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            model = BertModel.from_pretrained(model_name)
-
-            model_vocab_size = config.vocab_size
-            # Retrieve the embeddings and clone theme
-            cloned_embeddings = model.embeddings.word_embeddings.weight.clone()
-
-            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-            model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0] + 10)
-
-            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-            model.resize_token_embeddings(model_vocab_size)
-            self.assertEqual(model.config.vocab_size, model_vocab_size)
-            # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model.embeddings.word_embeddings.weight.shape[0], cloned_embeddings.shape[0])
-
-            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-            models_equal = True
-            for p1, p2 in zip(cloned_embeddings, model.embeddings.word_embeddings.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    models_equal = False
-
-            self.assertTrue(models_equal)
-
-    def test_tie_model_weights(self):
-        logging.basicConfig(level=logging.INFO)
-
-        def check_same_values(layer_1, layer_2):
-            equal = True
-            for p1, p2 in zip(layer_1.weight, layer_2.weight):
-                if p1.data.ne(p2.data).sum() > 0:
-                    equal = False
-            return equal
-
-        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = GPT2Config.from_pretrained(model_name)
-            model = GPT2LMHeadModel.from_pretrained(model_name)
-
-            # Get the embeddings and decoding layer
-            embeddings = model.transformer.wte
-            decoding = model.lm_head
-
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            self.assertTrue(check_same_values(embeddings, decoding))
-
-            # Check that after modification, they remain the same.
-            embeddings.weight.data.div_(2)
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            self.assertTrue(check_same_values(embeddings, decoding))
-
-            # Check that after modification, they remain the same.
-            decoding.weight.data.div_(4)
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertTrue(embeddings.weight.shape, decoding.weight.shape)
-            self.assertTrue(check_same_values(embeddings, decoding))
-
-            # Check that after resize they remain tied.
-            model.resize_token_embeddings(config.vocab_size + 10)
-            decoding.weight.data.mul_(20)
-            # Check that the embedding layer and decoding layer are the same in size and in value
-            self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
-            self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
index 00a9cb4614..4e32cc37e1 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -16,19 +16,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import unittest
-import json
-import random
-import shutil
 import pytest
 
-import torch
 
 from pytorch_transformers import (GPT2Config, GPT2Model,
-                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
+                                  GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 
 class GPT2ModelTest(unittest.TestCase):
 
@@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase):
         config_tester.run_common_tests()
 
     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                             lm_head_model_class=GPT2LMHeadModel,
                                             double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_common_tests(test_presents=True)
 
     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
                                             lm_head_model_class=GPT2LMHeadModel,
                                             double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_slow_tests()
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
index 4f57f4661b..243afb9501 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -19,12 +19,11 @@ from __future__ import print_function
 import unittest
 import pytest
 
-import torch
 
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester
 
 class OpenAIModelTest(unittest.TestCase):
 
@@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase):
         config_tester.run_common_tests()
 
     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                            lm_head_model_class=OpenAIGPTLMHeadModel,
                                            double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_common_tests(test_presents=False)
 
     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
                                            lm_head_model_class=OpenAIGPTLMHeadModel,
                                            double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_slow_tests()
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index 9631cd6034..e3c0fbcdf0 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,9 +28,15 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
+
+class TransfoXLModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel)
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
 
-class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
 
         def __init__(self,
@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase):
                      num_hidden_layers=5,
                      scope=None,
                      seed=1,
-                     all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
                      ):
             self.parent = parent
             self.batch_size = batch_size
@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase):
             self.num_hidden_layers = num_hidden_layers
             self.scope = scope
             self.seed = seed
-            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase):
                 list(list(mem.size()) for mem in result["mems_2"]),
                 [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
-        def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False)
+            return config, inputs_dict
 
-    def test_default(self):
-        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
+
+    def setUp(self):
+        self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
 
     def test_config(self):
-        config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()
+
+    def test_transfo_xl_model(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
+        self.model_tester.check_transfo_xl_model_output(output_result)
+
+    def test_transfo_xl_lm_head(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
+        self.model_tester.check_transfo_xl_lm_head_output(output_result)
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
 
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_transfo_xl_model(*config_and_inputs)
-        tester.check_transfo_xl_model_output(output_result)
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
-        tester.check_transfo_xl_lm_head_output(output_result)
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_transfo_xl_commons(*config_and_inputs)
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 9d6bc4054d..85189859a6 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,10 +23,15 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
 
 
-class XLMModelTest(unittest.TestCase):
+class XLMModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (XLMModel, XLMWithLMHeadModel,  
+                         XLMForQuestionAnswering, XLMForSequenceClassification) 
+                         # , XLMForSequenceClassification, XLMForTokenClassification),
+
     class XLMModelTester(object):
 
         def __init__(self,
@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase):
                      summary_type="last",
                      use_proj=True,
                      scope=None,
-                     all_model_classes = (XLMModel, XLMWithLMHeadModel,
-                                          XLMForQuestionAnswering, XLMForSequenceClassification),  # , XLMForSequenceClassification, XLMForTokenClassification),
                     ):
             self.parent = parent
             self.batch_size = batch_size
@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase):
                 [self.batch_size, self.type_sequence_label_size])
 
 
-        def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_lengths,
+             sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict
 
-    def test_default(self):
-        self.run_tester(XLMModelTest.XLMModelTester(self))
+    def setUp(self):
+        self.model_tester = XLMModelTest.XLMModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
 
     def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()
 
-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
-
-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlm_model(*config_and_inputs)
+    def test_xlm_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_model(*config_and_inputs)
 
         # config_and_inputs = tester.prepare_config_and_inputs()
         # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase):
         # config_and_inputs = tester.prepare_config_and_inputs()
         # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
 
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlm_commons(*config_and_inputs)
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index 41c114ce9c..3792125d6e 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,9 +28,14 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
+
+class XLNetModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes=(XLNetModel, XLNetLMHeadModel,
+                    XLNetForSequenceClassification, XLNetForQuestionAnswering)
+    test_pruning = False
 
-class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):
 
         def __init__(self,
@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase):
                      initializer_range=0.05,
                      seed=1,
                      type_vocab_size=2,
-                     all_model_classes=(XLNetModel, XLNetLMHeadModel,
-                                        XLNetForSequenceClassification, XLNetForQuestionAnswering),
             ):
             self.parent = parent
             self.batch_size = batch_size
@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase):
             self.seed = seed
             self.type_vocab_size = type_vocab_size
             self.type_sequence_label_size = type_sequence_label_size
-            self.all_model_classes = all_model_classes
 
         def prepare_config_and_inputs(self):
             input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase):
                 list(list(mem.size()) for mem in result["mems_1"]),
                 [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
-        def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
+                target_mapping, inp_q, segment_ids, lm_labels,
+                sequence_labels, is_impossible_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids_1}
-            create_and_check_commons(self, config, inputs_dict, test_pruning=False)
+            return config, inputs_dict
 
-    def test_default(self):
-        self.run_tester(XLNetModelTest.XLNetModelTester(self))
+
+    def setUp(self):
+        self.model_tester = XLNetModelTest.XLNetModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
 
     def test_config(self):
-        config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()
+
+    def test_xlnet_base_model(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
+
+    def test_xlnet_lm_head(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+
+    def test_xlnet_sequence_classif(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
+
+    def test_xlnet_qa(self):
+        self.model_tester.set_seed()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
 
-    def run_tester(self, tester):
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_base_model(*config_and_inputs)
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_qa(*config_and_inputs)
-
-        tester.set_seed()
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_xlnet_commons(*config_and_inputs)
-
 
 if __name__ == "__main__":
     unittest.main()

From 744295636116eac1c0b84e23e9b3cab90886a45d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 11:26:16 +0200
Subject: [PATCH 110/139] save config file

---
 pytorch_transformers/modeling_utils.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 9ca3a3d090..bb2b82b41c 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -48,6 +48,17 @@ class PretrainedConfig(object):
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
         self.torchscript = kwargs.pop('torchscript', False)
 
+    def save_pretrained(self, save_directory):
+        """ Save a configuration file to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        self.to_json_file(output_config_file)
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *input, **kwargs):
         """
@@ -248,12 +259,13 @@ class PreTrainedModel(nn.Module):
         # Only save the model it-self if we are using distributed training
         model_to_save = self.module if hasattr(self, 'module') else self
 
+        # Save configuration file
+        model_to_save.config.save_pretrained(save_directory)
+
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
-        output_config_file = os.path.join(save_directory, CONFIG_NAME)
 
         torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):

From 762ded9b1c92f9cef2aa08c907e0f9b11b43e37a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 11:28:52 +0200
Subject: [PATCH 111/139] wip examples

---
 examples/run_bert_squad.py |  2 +-
 examples/run_glue.py       |  2 +-
 examples/run_squad.py      | 39 ++++++++++++++++++++------------------
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index c3fdb06316..e5ba1b3b95 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Run BERT on SQuAD."""
+""" Finetuning a question-answering Bert model on SQuAD."""
 
 from __future__ import absolute_import, division, print_function
 
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 7e615804c1..6f96a23476 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""BERT finetuning runner."""
+""" Finetuning a classification model (Bert, XLM, XLNet,...) on GLUE."""
 
 from __future__ import absolute_import, division, print_function
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 7f063109e3..3d3d964687 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Run BERT on SQuAD."""
+""" Finetuning a question-answering model (Bert, XLM, XLNet,...) on SQuAD."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -21,7 +21,6 @@ import argparse
 import logging
 import os
 import random
-import sys
 from io import open
 
 import numpy as np
@@ -33,31 +32,35 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import (BertForQuestionAnswering, XLNetForQuestionAnswering,
-                                  XLMForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
-                                  XLMTokenizer)
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer)
+
+from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
 
 MODEL_CLASSES = {
-    'bert': BertForQuestionAnswering,
-    'xlnet': XLNetForQuestionAnswering,
-    'xlm': XLMForQuestionAnswering,
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
 }
 
-TOKENIZER_CLASSES = {
-    'bert': BertTokenizer,
-    'xlnet': XLNetTokenizer,
-    'xlm': XLMTokenizer,
-}
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
 
 def train(args, train_dataset, model):
     """ Train the model """

From 699bc7e86ea2253bfb2f011f006180b2e49f0703 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 11:46:57 +0200
Subject: [PATCH 112/139] fix gpt-2 unk token test

---
 docs/README.md                            | 2 +-
 pytorch_transformers/tokenization_gpt2.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index c39ecda0d1..1b3c1feade 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -57,4 +57,4 @@ It should build the static app that will be available under `/docs/_build/html`
 ## Adding a new element to the tree (toc-tree)
 
 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
-in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
\ No newline at end of file
+in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 6084dc3e05..bd90a92251 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -177,7 +177,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        return self.encoder.get(token)
+        if token in self.encoder:
+            return self.encoder.get(token)
+        return self.encoder.get(self.unk_token)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""

From 936e813c848aa5cad842a18498c440a72505c265 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 14:16:06 +0200
Subject: [PATCH 113/139] clean up examples - added squad example and test

---
 examples/generation_xlnet.py                  |  20 -
 examples/run_bert_extract_features.py         | 297 ----------
 examples/run_bert_squad.py                    | 399 -------------
 examples/{bertology.py => run_bertology.py}   |   0
 examples/run_glue.py                          |  13 +-
 examples/run_gpt2.py                          | 131 -----
 examples/run_squad.py                         | 505 ++++++++--------
 examples/run_swag.py                          | 555 ++++++++++++++++++
 examples/run_xlnet_squad.py                   | 399 -------------
 examples/test_examples.py                     |  26 +
 examples/tests_samples/.gitignore             |   1 +
 .../tests_samples/SQUAD/dev-v2.0-small.json   | 140 +++++
 examples/utils_squad.py                       |   4 +-
 examples/utils_squad_evaluate.py              | 289 +++++++++
 14 files changed, 1266 insertions(+), 1513 deletions(-)
 delete mode 100644 examples/generation_xlnet.py
 delete mode 100644 examples/run_bert_extract_features.py
 delete mode 100644 examples/run_bert_squad.py
 rename examples/{bertology.py => run_bertology.py} (100%)
 delete mode 100644 examples/run_gpt2.py
 create mode 100644 examples/run_swag.py
 delete mode 100644 examples/run_xlnet_squad.py
 create mode 100644 examples/tests_samples/SQUAD/dev-v2.0-small.json
 create mode 100644 examples/utils_squad_evaluate.py

diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
deleted file mode 100644
index fe3610cfd1..0000000000
--- a/examples/generation_xlnet.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import torch
-from torch.nn import functional as F
-from pytorch_transformers import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
-
-import logging
-logging.basicConfig(level=logging.INFO)
-
-tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', attn_type='uni')
-
-tokens = tokenizer.encode('I am very happy')
-for i in range(len(tokens), 20):
-    mask = torch.tensor([[[0.0] * i + [1.0]]])
-    logits, _ = model(torch.tensor([tokens + [0]]),
-                    #   perm_mask=mask.expand(-1, i+1, -1),
-                      target_mapping=mask,
-                      inp_q=mask.squeeze(1))
-    output = torch.multinomial(F.softmax(logits[0, 0, :]), 1)
-    tokens.append(output.item())
-    print(tokenizer.decode(tokens))
diff --git a/examples/run_bert_extract_features.py b/examples/run_bert_extract_features.py
deleted file mode 100644
index cc7dedd6af..0000000000
--- a/examples/run_bert_extract_features.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Extract pre-computed feature vectors from a PyTorch BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import logging
-import json
-import re
-
-import torch
-from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.modeling_bert import BertModel
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-
-    def __init__(self, unique_id, text_a, text_b):
-        self.unique_id = unique_id
-        self.text_a = text_a
-        self.text_b = text_b
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
-        self.unique_id = unique_id
-        self.tokens = tokens
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.input_type_ids = input_type_ids
-
-
-def convert_examples_to_features(examples, seq_length, tokenizer):
-    """Loads a data file into a list of `InputFeature`s."""
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > seq_length - 2:
-                tokens_a = tokens_a[0:(seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambigiously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        input_type_ids = []
-        tokens.append("[CLS]")
-        input_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            input_type_ids.append(0)
-        tokens.append("[SEP]")
-        input_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                input_type_ids.append(1)
-            tokens.append("[SEP]")
-            input_type_ids.append(1)
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        while len(input_ids) < seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            input_type_ids.append(0)
-
-        assert len(input_ids) == seq_length
-        assert len(input_mask) == seq_length
-        assert len(input_type_ids) == seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("unique_id: %s" % (example.unique_id))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
-
-        features.append(
-            InputFeatures(
-                unique_id=example.unique_id,
-                tokens=tokens,
-                input_ids=input_ids,
-                input_mask=input_mask,
-                input_type_ids=input_type_ids))
-    return features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def read_examples(input_file):
-    """Read a list of `InputExample`s from an input file."""
-    examples = []
-    unique_id = 0
-    with open(input_file, "r", encoding='utf-8') as reader:
-        while True:
-            line = reader.readline()
-            if not line:
-                break
-            line = line.strip()
-            text_a = None
-            text_b = None
-            m = re.match(r"^(.*) \|\|\| (.*)$", line)
-            if m is None:
-                text_a = line
-            else:
-                text_a = m.group(1)
-                text_b = m.group(2)
-            examples.append(
-                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
-            unique_id += 1
-    return examples
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--input_file", default=None, type=str, required=True)
-    parser.add_argument("--output_file", default=None, type=str, required=True)
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-
-    ## Other parameters
-    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
-                            "than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help = "local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
-
-    layer_indexes = [int(x) for x in args.layers.split(",")]
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    examples = read_examples(args.input_file)
-
-    features = convert_examples_to_features(
-        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
-
-    unique_id_to_feature = {}
-    for feature in features:
-        unique_id_to_feature[feature.unique_id] = feature
-
-    model = BertModel.from_pretrained(args.bert_model)
-    model.to(device)
-
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
-    if args.local_rank == -1:
-        eval_sampler = SequentialSampler(eval_data)
-    else:
-        eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    model.eval()
-    with open(args.output_file, "w", encoding='utf-8') as writer:
-        for input_ids, input_mask, example_indices in eval_dataloader:
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-
-            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
-            all_encoder_layers = all_encoder_layers
-
-            for b, example_index in enumerate(example_indices):
-                feature = features[example_index.item()]
-                unique_id = int(feature.unique_id)
-                # feature = unique_id_to_feature[unique_id]
-                output_json = collections.OrderedDict()
-                output_json["linex_index"] = unique_id
-                all_out_features = []
-                for (i, token) in enumerate(feature.tokens):
-                    all_layers = []
-                    for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
-                        layer_output = layer_output[b]
-                        layers = collections.OrderedDict()
-                        layers["index"] = layer_index
-                        layers["values"] = [
-                            round(x.item(), 6) for x in layer_output[i]
-                        ]
-                        all_layers.append(layers)
-                    out_features = collections.OrderedDict()
-                    out_features["token"] = token
-                    out_features["layers"] = all_layers
-                    all_out_features.append(out_features)
-                output_json["features"] = all_out_features
-                writer.write(json.dumps(output_json) + "\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
deleted file mode 100644
index e5ba1b3b95..0000000000
--- a/examples/run_bert_squad.py
+++ /dev/null
@@ -1,399 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning a question-answering Bert model on SQuAD."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import random
-import sys
-from io import open
-
-import numpy as np
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from tensorboardX import SummaryWriter
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForQuestionAnswering
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_transformers.tokenization_bert import BertTokenizer
-
-from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
-
-    ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-    print(args)
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError(
-                "If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError(
-                "If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
-        train_examples = read_squad_examples(
-            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
-        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=args.max_seq_length,
-                doc_stride=args.doc_stride,
-                max_query_length=args.max_query_length,
-                is_training=True)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                   all_start_positions, all_end_positions)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        # if args.local_rank != -1:
-        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num orig examples = %d", len(train_examples))
-        logger.info("  Num split examples = %d", len(train_features))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                if n_gpu == 1:
-                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
-                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used and handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-
-    model.to(device)
-
-    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_squad_examples(
-            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-        eval_features = convert_examples_to_features(
-            examples=eval_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=False)
-
-        logger.info("***** Running predictions *****")
-        logger.info("  Num orig examples = %d", len(eval_examples))
-        logger.info("  Num split examples = %d", len(eval_features))
-        logger.info("  Batch size = %d", args.predict_batch_size)
-
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
-
-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/bertology.py b/examples/run_bertology.py
similarity index 100%
rename from examples/bertology.py
rename to examples/run_bertology.py
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 6f96a23476..f0633c3f12 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -306,9 +306,9 @@ def main():
                         help="Set this flag if you are using an uncased model.")
 
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU for training.")
+                        help="Batch size per GPU/CPU for training.")
     parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU for evaluation.")
+                        help="Batch size per GPU/CPU for evaluation.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
@@ -395,8 +395,7 @@ def main():
 
     # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     args.model_type = ""
     for key in MODEL_CLASSES:
@@ -409,7 +408,7 @@ def main():
     model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
 
     if args.local_rank == 0:
-        torch.distributed.barrier()
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     # Distributed and parrallel training
     model.to(args.device)
@@ -422,6 +421,7 @@ def main():
 
     logger.info("Training/evaluation parameters %s", args)
 
+
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
@@ -450,6 +450,7 @@ def main():
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
+
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
@@ -459,7 +460,7 @@ def main():
             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1]
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=global_step)
diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
deleted file mode 100644
index a759e449f9..0000000000
--- a/examples/run_gpt2.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import logging
-from tqdm import trange
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-
-from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-def top_k_logits(logits, k):
-    """
-    Masks everything but the k top entries as -infinity (1e10).
-    Used to mask logits such that e^-infinity -> 0 won't contribute to the
-    sum of the denominator.
-    """
-    if k == 0:
-        return logits
-    else:
-        values = torch.topk(logits, k)[0]
-        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
-        return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)
-
-def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
-    if start_token is None:
-        assert context is not None, 'Specify exactly one of start_token and context!'
-        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
-    else:
-        assert context is None, 'Specify exactly one of start_token and context!'
-        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
-    prev = context
-    output = context
-    past = None
-    with torch.no_grad():
-        for i in trange(length):
-            logits, past = model(prev, past=past)
-            logits = logits[:, -1, :] / temperature
-            logits = top_k_logits(logits, k=top_k)
-            log_probs = F.softmax(logits, dim=-1)
-            if sample:
-                prev = torch.multinomial(log_probs, num_samples=1)
-            else:
-                _, prev = torch.topk(log_probs, k=1, dim=-1)
-            output = torch.cat((output, prev), dim=1)
-    return output
-
-def run_model():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--nsamples", type=int, default=1)
-    parser.add_argument("--batch_size", type=int, default=-1)
-    parser.add_argument("--length", type=int, default=-1)
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--top_k", type=int, default=0)
-    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
-    args = parser.parse_args()
-    print(args)
-
-    if args.batch_size == -1:
-        args.batch_size = 1
-    assert args.nsamples % args.batch_size == 0
-
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    torch.cuda.manual_seed(args.seed)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
-    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
-    model.to(device)
-    model.eval()
-
-    if args.length == -1:
-        args.length = model.config.n_ctx // 2
-    elif args.length > model.config.n_ctx:
-        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
-
-    while True:
-        context_tokens = []
-        if not args.unconditional:
-            raw_text = input("Model prompt >>> ")
-            while not raw_text:
-                print('Prompt should not be empty!')
-                raw_text = input("Model prompt >>> ")
-            context_tokens = enc.encode(raw_text)
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=context_tokens,
-                    start_token=None,
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:, len(context_tokens):].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-        else:
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=None,
-                    start_token=enc.encoder['<|endoftext|>'],
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:,1:].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-
-if __name__ == '__main__':
-    run_model()
-
-
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 3d3d964687..af4a771f4a 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -43,6 +43,8 @@ from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
+from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
@@ -62,29 +64,29 @@ def set_seed(args):
         torch.cuda.manual_seed_all(args.seed)
 
 
-def train(args, train_dataset, model):
+def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     if args.max_steps > 0:
-        num_train_optimization_steps = args.max_steps
+        t_total = args.max_steps
         args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
     else:
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
-    # Prepare optimizer
+    # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
-                         t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -96,72 +98,172 @@ def train(args, train_dataset, model):
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Total batch size (distributed) = %d", args.train_batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
+    logger.info("  Total optimization steps = %d", t_total)
 
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
-    model.train()
-    optimizer.zero_grad()
-    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
             batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
+            inputs = {'input_ids':       batch[0],
+                      'token_type_ids':  batch[1] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'attention_mask':  batch[2],
+                      'start_positions': batch[3],
+                      'end_positions':   batch[4]}
             ouputs = model(**inputs)
-            loss = ouputs[0]
+            loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    return global_step, tr_loss / global_step
 
 
-def evalutate(args, dataset, model):
-    """ Evaluate the model """
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        example_indices = batch[3]
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                        'token_type_ids': batch[1] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                        'attention_mask': batch[2]}
+            outputs = model(**inputs)
+            batch_start_logits, batch_end_logits = outputs[:2]
+
+        for i, example_index in enumerate(example_indices):
+            start_logits = batch_start_logits[i].detach().cpu().tolist()
+            end_logits = batch_end_logits[i].detach().cpu().tolist()
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            all_results.append(RawResult(unique_id=unique_id,
+                                         start_logits=start_logits,
+                                         end_logits=end_logits))
+
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    all_predictions = write_predictions(examples, features, all_results,
+                                        args.n_best_size, args.max_answer_length,
+                                        args.do_lower_case, output_prediction_file,
+                                        output_nbest_file, output_null_log_odds_file,
+                                        args.verbose_logging, args.version_2_with_negative,
+                                        args.null_score_diff_threshold)
+
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
 
 
-
-def load_and_cache_examples(args, tokenizer, training=True):
-    """ Load data features from cache or dataset file. """
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
-    if os.path.exists(cached_features_file):
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = processor.get_labels()
-        examples = read_squad_examples(input_file=args.train_file if training else args.predict_file,
-                        is_training=training,
-                        version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=training)
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                       is_training=not evaluate,
+                                       version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
         if args.local_rank in [-1, 0]:
-            logger.info("Num orig examples = %d", len(examples))
-            logger.info("Num split examples = %d", len(features))
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
     # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if training:
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
-    else:
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if evaluate:
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
         dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
 
+    if output_examples:
+        return dataset, examples, features
     return dataset
 
 
@@ -179,12 +281,17 @@ def main():
                         help="The output directory where the model checkpoints and predictions will be written.")
 
     ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
     parser.add_argument('--version_2_with_negative', action='store_true',
                         help='If true, the SQuAD examples contain some that do not have an answer.')
     parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                         help="If null_score - best_non_null is greater than the threshold predict null.")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
 
     parser.add_argument("--max_seq_length", default=384, type=int,
                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
@@ -196,23 +303,33 @@ def main():
                              "be truncated to this length.")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true',
+    parser.add_argument("--do_eval", action='store_true',
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+                        help="Set this flag if you are using an uncased model.")
 
-    parser.add_argument("--train_batch_size", default=32, type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int,
-                        help="Total batch size for predictions.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
     parser.add_argument("--n_best_size", default=20, type=int,
                         help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
     parser.add_argument("--max_answer_length", default=30, type=int,
@@ -222,10 +339,21 @@ def main():
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
 
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
+
     parser.add_argument("--local_rank", type=int, default=-1,
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--fp16', action='store_true',
@@ -236,11 +364,11 @@ def main():
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
-    print(args)
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
 
+    # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
@@ -260,29 +388,31 @@ def main():
     args.device = device
 
     # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
-    # Setup seeds
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
+    # Set seed
+    set_seed(args)
 
     # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only 1st process in distributed training download model & vocab
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    args.model_type = args.model_name.lower().split('-')[0]
-    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
 
     if args.local_rank == 0:
-        torch.distributed.barrier()
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     # Distributed and parrallel training
     model.to(args.device)
@@ -293,199 +423,54 @@ def main():
     elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    logger.info("Training/evaluation parameters %s", args)
+
     # Training
     if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
-        train_examples = read_squad_examples(
-            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
-        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=args.max_seq_length,
-                doc_stride=args.doc_stride,
-                max_query_length=args.max_query_length,
-                is_training=True)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                   all_start_positions, all_end_positions)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
 
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        # if args.local_rank != -1:
-        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
 
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num orig examples = %d", len(train_examples))
-        logger.info("  Num split examples = %d", len(train_features))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                if n_gpu == 1:
-                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
-                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used and handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
-    model.to(device)
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
 
-    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_squad_examples(
-            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-        eval_features = convert_examples_to_features(
-            examples=eval_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=False)
 
-        logger.info("***** Running predictions *****")
-        logger.info("  Num orig examples = %d", len(eval_examples))
-        logger.info("  Num split examples = %d", len(eval_features))
-        logger.info("  Batch size = %d", args.predict_batch_size)
-
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
-
-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+    logger.info("Results: {}".format(results))
+    return results
 
 
 if __name__ == "__main__":
diff --git a/examples/run_swag.py b/examples/run_swag.py
new file mode 100644
index 0000000000..00cd3a7840
--- /dev/null
+++ b/examples/run_swag.py
@@ -0,0 +1,555 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+
+import argparse
+import csv
+import logging
+import os
+import random
+import sys
+from io import open
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class SwagExample(object):
+    """A single training/test example for the SWAG dataset."""
+    def __init__(self,
+                 swag_id,
+                 context_sentence,
+                 start_ending,
+                 ending_0,
+                 ending_1,
+                 ending_2,
+                 ending_3,
+                 label = None):
+        self.swag_id = swag_id
+        self.context_sentence = context_sentence
+        self.start_ending = start_ending
+        self.endings = [
+            ending_0,
+            ending_1,
+            ending_2,
+            ending_3,
+        ]
+        self.label = label
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        l = [
+            "swag_id: {}".format(self.swag_id),
+            "context_sentence: {}".format(self.context_sentence),
+            "start_ending: {}".format(self.start_ending),
+            "ending_0: {}".format(self.endings[0]),
+            "ending_1: {}".format(self.endings[1]),
+            "ending_2: {}".format(self.endings[2]),
+            "ending_3: {}".format(self.endings[3]),
+        ]
+
+        if self.label is not None:
+            l.append("label: {}".format(self.label))
+
+        return ", ".join(l)
+
+
+class InputFeatures(object):
+    def __init__(self,
+                 example_id,
+                 choices_features,
+                 label
+
+    ):
+        self.example_id = example_id
+        self.choices_features = [
+            {
+                'input_ids': input_ids,
+                'input_mask': input_mask,
+                'segment_ids': segment_ids
+            }
+            for _, input_ids, input_mask, segment_ids in choices_features
+        ]
+        self.label = label
+
+
+def read_swag_examples(input_file, is_training):
+    with open(input_file, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        lines = []
+        for line in reader:
+            if sys.version_info[0] == 2:
+                line = list(unicode(cell, 'utf-8') for cell in line)
+            lines.append(line)
+
+    if is_training and lines[0][-1] != 'label':
+        raise ValueError(
+            "For training, the input file must contain a label column."
+        )
+
+    examples = [
+        SwagExample(
+            swag_id = line[2],
+            context_sentence = line[4],
+            start_ending = line[5], # in the swag dataset, the
+                                         # common beginning of each
+                                         # choice is stored in "sent2".
+            ending_0 = line[7],
+            ending_1 = line[8],
+            ending_2 = line[9],
+            ending_3 = line[10],
+            label = int(line[11]) if is_training else None
+        ) for line in lines[1:] # we skip the line with the column names
+    ]
+
+    return examples
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                 is_training):
+    """Loads a data file into a list of `InputBatch`s."""
+
+    # Swag is a multiple choice task. To perform this task using Bert,
+    # we will use the formatting proposed in "Improving Language
+    # Understanding by Generative Pre-Training" and suggested by
+    # @jacobdevlin-google in this issue
+    # https://github.com/google-research/bert/issues/38.
+    #
+    # Each choice will correspond to a sample on which we run the
+    # inference. For a given Swag example, we will create the 4
+    # following inputs:
+    # - [CLS] context [SEP] choice_1 [SEP]
+    # - [CLS] context [SEP] choice_2 [SEP]
+    # - [CLS] context [SEP] choice_3 [SEP]
+    # - [CLS] context [SEP] choice_4 [SEP]
+    # The model will output a single value for each input. To get the
+    # final decision of the model, we will run a softmax over these 4
+    # outputs.
+    features = []
+    for example_index, example in enumerate(examples):
+        context_tokens = tokenizer.tokenize(example.context_sentence)
+        start_ending_tokens = tokenizer.tokenize(example.start_ending)
+
+        choices_features = []
+        for ending_index, ending in enumerate(example.endings):
+            # We create a copy of the context tokens in order to be
+            # able to shrink it according to ending_tokens
+            context_tokens_choice = context_tokens[:]
+            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
+            # Modifies `context_tokens_choice` and `ending_tokens` in
+            # place so that the total length is less than the
+            # specified length.  Account for [CLS], [SEP], [SEP] with
+            # "- 3"
+            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
+
+            tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
+            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding = [0] * (max_seq_length - len(input_ids))
+            input_ids += padding
+            input_mask += padding
+            segment_ids += padding
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            choices_features.append((tokens, input_ids, input_mask, segment_ids))
+
+        label = example.label
+        if example_index < 5:
+            logger.info("*** Example ***")
+            logger.info("swag_id: {}".format(example.swag_id))
+            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+                logger.info("choice: {}".format(choice_idx))
+                logger.info("tokens: {}".format(' '.join(tokens)))
+                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+            if is_training:
+                logger.info("label: {}".format(label))
+
+        features.append(
+            InputFeatures(
+                example_id = example.swag_id,
+                choices_features = choices_features,
+                label = label
+            )
+        )
+
+    return features
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+def accuracy(out, labels):
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels)
+
+def select_field(features, field):
+    return [
+        [
+            choice[field]
+            for choice in feature.choices_features
+        ]
+        for feature in features
+    ]
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--max_seq_length",
+                        default=128,
+                        type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_train",
+                        action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval",
+                        action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size",
+                        default=32,
+                        type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size",
+                        default=8,
+                        type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument("--learning_rate",
+                        default=5e-5,
+                        type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs",
+                        default=3.0,
+                        type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion",
+                        default=0.1,
+                        type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--loss_scale',
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+
+    args = parser.parse_args()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+
+    # Prepare model
+    model = BertForMultipleChoice.from_pretrained(args.bert_model,
+        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
+        num_choices=4)
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        try:
+            from apex.parallel import DistributedDataParallel as DDP
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+        model = DDP(model)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+
+        # Prepare data loader
+
+        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
+        train_features = convert_examples_to_features(
+            train_examples, tokenizer, args.max_seq_length, True)
+        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
+        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
+        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
+        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        if args.local_rank != -1:
+            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+
+        # Prepare optimizer
+
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        global_step = 0
+
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0
+            nb_tr_examples, nb_tr_steps = 0, 0
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+                loss = model(input_ids, segment_ids, input_mask, label_ids)
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu.
+                if args.fp16 and args.loss_scale != 1.0:
+                    # rescale loss for fp16 training
+                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
+                    loss = loss * args.loss_scale
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used that handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+
+
+    if args.do_train:
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+    else:
+        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
+    model.to(device)
+
+
+    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
+        eval_features = convert_examples_to_features(
+            eval_examples, tokenizer, args.max_seq_length, True)
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
+        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
+        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
+        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            with torch.no_grad():
+                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
+                logits = model(input_ids, segment_ids, input_mask)
+
+            logits = logits.detach().cpu().numpy()
+            label_ids = label_ids.to('cpu').numpy()
+            tmp_eval_accuracy = accuracy(logits, label_ids)
+
+            eval_loss += tmp_eval_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+
+        result = {'eval_loss': eval_loss,
+                  'eval_accuracy': eval_accuracy,
+                  'global_step': global_step,
+                  'loss': tr_loss/global_step}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
deleted file mode 100644
index 393fa98abd..0000000000
--- a/examples/run_xlnet_squad.py
+++ /dev/null
@@ -1,399 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Run BERT on SQuAD."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import random
-import sys
-from io import open
-
-import numpy as np
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from tensorboardX import SummaryWriter
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_xlnet import BertForQuestionAnswering
-from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-
-from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
-
-    ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--doc_stride", default=128, type=int,
-                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
-    parser.add_argument("--max_query_length", default=64, type=int,
-                        help="The maximum number of tokens for the question. Questions longer than this will "
-                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
-    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
-    parser.add_argument("--max_answer_length", default=30, type=int,
-                        help="The maximum length of an answer that can be generated. This is needed because the start "
-                             "and end predictions are not conditioned on one another.")
-    parser.add_argument("--verbose_logging", action='store_true',
-                        help="If true, all of the warnings related to data processing will be printed. "
-                             "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-    print(args)
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError(
-                "If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError(
-                "If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
-        train_examples = read_squad_examples(
-            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
-        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=args.max_seq_length,
-                doc_stride=args.doc_stride,
-                max_query_length=args.max_query_length,
-                is_training=True)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                   all_start_positions, all_end_positions)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        # if args.local_rank != -1:
-        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num orig examples = %d", len(train_examples))
-        logger.info("  Num split examples = %d", len(train_features))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                if n_gpu == 1:
-                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
-                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used and handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-
-    model.to(device)
-
-    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_squad_examples(
-            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-        eval_features = convert_examples_to_features(
-            examples=eval_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=False)
-
-        logger.info("***** Running predictions *****")
-        logger.info("  Num orig examples = %d", len(eval_examples))
-        logger.info("  Num split examples = %d", len(eval_features))
-        logger.info("  Batch size = %d", args.predict_batch_size)
-
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
-
-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 2e6ed45063..6b7090bcb9 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -28,6 +28,7 @@ except ImportError:
     from mock import patch
 
 import run_glue
+import run_squad
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -64,6 +65,31 @@ class ExamplesTests(unittest.TestCase):
             for value in result.values():
                 self.assertGreaterEqual(value, 0.75)
 
+    def test_run_squad(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_squad.py",
+                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--model_name=bert-base-uncased",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--do_train",
+                    "--do_eval",
+                    "--version_2_with_negative",
+                    "--learning_rate=1e-4",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--overwrite_output_dir",
+                    "--seed=42"]
+        model_name = "--model_name=bert-base-uncased"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_squad.main()
+            self.assertGreaterEqual(result['f1'], 30)
+            self.assertGreaterEqual(result['exact'], 30)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/examples/tests_samples/.gitignore b/examples/tests_samples/.gitignore
index 1ac7520522..c8ce21fe24 100644
--- a/examples/tests_samples/.gitignore
+++ b/examples/tests_samples/.gitignore
@@ -2,4 +2,5 @@
 cache*
 temp*
 !*.tsv
+!*.json
 !.gitignore
\ No newline at end of file
diff --git a/examples/tests_samples/SQUAD/dev-v2.0-small.json b/examples/tests_samples/SQUAD/dev-v2.0-small.json
new file mode 100644
index 0000000000..834d9ee660
--- /dev/null
+++ b/examples/tests_samples/SQUAD/dev-v2.0-small.json
@@ -0,0 +1,140 @@
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
\ No newline at end of file
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index c858776183..305eeb7b40 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -556,7 +556,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
             if len(nbest)==1:
                 nbest.insert(0,
                     _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-                
+
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
         if not nbest:
@@ -609,6 +609,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
         with open(output_null_log_odds_file, "w") as writer:
             writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
+    return all_predictions
+
 
 def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
     """Project the tokenized prediction back to the original text."""
diff --git a/examples/utils_squad_evaluate.py b/examples/utils_squad_evaluate.py
new file mode 100644
index 0000000000..d0cf643fe3
--- /dev/null
+++ b/examples/utils_squad_evaluate.py
@@ -0,0 +1,289 @@
+"""Official evaluation script for SQuAD version 2.0.
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question ID's to the model's predicted probability
+that a question is unanswerable.
+"""
+import argparse
+import collections
+import json
+import numpy as np
+import os
+import re
+import string
+import sys
+
+class EVAL_OPTS():
+  def __init__(self, data_file, pred_file, out_file="",
+               na_prob_file="na_prob.json", na_prob_thresh=1.0,
+               out_image_dir=None, verbose=False):
+    self.data_file = data_file
+    self.pred_file = pred_file
+    self.out_file = out_file
+    self.na_prob_file = na_prob_file
+    self.na_prob_thresh = na_prob_thresh
+    self.out_image_dir = out_image_dir
+    self.verbose = verbose
+
+OPTS = None
+
+def parse_args():
+  parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
+  parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
+  parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
+  parser.add_argument('--out-file', '-o', metavar='eval.json',
+                      help='Write accuracy metrics to file (default is stdout).')
+  parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
+                      help='Model estimates of probability of no answer.')
+  parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
+                      help='Predict "" if no-answer probability exceeds this (default = 1.0).')
+  parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
+                      help='Save precision-recall curves to directory.')
+  parser.add_argument('--verbose', '-v', action='store_true')
+  if len(sys.argv) == 1:
+    parser.print_help()
+    sys.exit(1)
+  return parser.parse_args()
+
+def make_qid_to_has_ans(dataset):
+  qid_to_has_ans = {}
+  for article in dataset:
+    for p in article['paragraphs']:
+      for qa in p['qas']:
+        qid_to_has_ans[qa['id']] = bool(qa['answers'])
+  return qid_to_has_ans
+
+def normalize_answer(s):
+  """Lower text and remove punctuation, articles and extra whitespace."""
+  def remove_articles(text):
+    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+    return re.sub(regex, ' ', text)
+  def white_space_fix(text):
+    return ' '.join(text.split())
+  def remove_punc(text):
+    exclude = set(string.punctuation)
+    return ''.join(ch for ch in text if ch not in exclude)
+  def lower(text):
+    return text.lower()
+  return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+def get_tokens(s):
+  if not s: return []
+  return normalize_answer(s).split()
+
+def compute_exact(a_gold, a_pred):
+  return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+def compute_f1(a_gold, a_pred):
+  gold_toks = get_tokens(a_gold)
+  pred_toks = get_tokens(a_pred)
+  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+  num_same = sum(common.values())
+  if len(gold_toks) == 0 or len(pred_toks) == 0:
+    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+    return int(gold_toks == pred_toks)
+  if num_same == 0:
+    return 0
+  precision = 1.0 * num_same / len(pred_toks)
+  recall = 1.0 * num_same / len(gold_toks)
+  f1 = (2 * precision * recall) / (precision + recall)
+  return f1
+
+def get_raw_scores(dataset, preds):
+  exact_scores = {}
+  f1_scores = {}
+  for article in dataset:
+    for p in article['paragraphs']:
+      for qa in p['qas']:
+        qid = qa['id']
+        gold_answers = [a['text'] for a in qa['answers']
+                        if normalize_answer(a['text'])]
+        if not gold_answers:
+          # For unanswerable questions, only correct answer is empty string
+          gold_answers = ['']
+        if qid not in preds:
+          print('Missing prediction for %s' % qid)
+          continue
+        a_pred = preds[qid]
+        # Take max over all gold answers
+        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
+        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
+  return exact_scores, f1_scores
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+  new_scores = {}
+  for qid, s in scores.items():
+    pred_na = na_probs[qid] > na_prob_thresh
+    if pred_na:
+      new_scores[qid] = float(not qid_to_has_ans[qid])
+    else:
+      new_scores[qid] = s
+  return new_scores
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+  if not qid_list:
+    total = len(exact_scores)
+    return collections.OrderedDict([
+        ('exact', 100.0 * sum(exact_scores.values()) / total),
+        ('f1', 100.0 * sum(f1_scores.values()) / total),
+        ('total', total),
+    ])
+  else:
+    total = len(qid_list)
+    return collections.OrderedDict([
+        ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+        ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+        ('total', total),
+    ])
+
+def merge_eval(main_eval, new_eval, prefix):
+  for k in new_eval:
+    main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+def plot_pr_curve(precisions, recalls, out_image, title):
+  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
+  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
+  plt.xlabel('Recall')
+  plt.ylabel('Precision')
+  plt.xlim([0.0, 1.05])
+  plt.ylim([0.0, 1.05])
+  plt.title(title)
+  plt.savefig(out_image)
+  plt.clf()
+
+def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
+                               out_image=None, title=None):
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  true_pos = 0.0
+  cur_p = 1.0
+  cur_r = 0.0
+  precisions = [1.0]
+  recalls = [0.0]
+  avg_prec = 0.0
+  for i, qid in enumerate(qid_list):
+    if qid_to_has_ans[qid]:
+      true_pos += scores[qid]
+    cur_p = true_pos / float(i+1)
+    cur_r = true_pos / float(num_true_pos)
+    if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
+      # i.e., if we can put a threshold after this point
+      avg_prec += cur_p * (cur_r - recalls[-1])
+      precisions.append(cur_p)
+      recalls.append(cur_r)
+  if out_image:
+    plot_pr_curve(precisions, recalls, out_image, title)
+  return {'ap': 100.0 * avg_prec}
+
+def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 
+                                  qid_to_has_ans, out_image_dir):
+  if out_image_dir and not os.path.exists(out_image_dir):
+    os.makedirs(out_image_dir)
+  num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
+  if num_true_pos == 0:
+    return
+  pr_exact = make_precision_recall_eval(
+      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_exact.png'),
+      title='Precision-Recall curve for Exact Match score')
+  pr_f1 = make_precision_recall_eval(
+      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_f1.png'),
+      title='Precision-Recall curve for F1 score')
+  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
+  pr_oracle = make_precision_recall_eval(
+      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
+      out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
+      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
+  merge_eval(main_eval, pr_exact, 'pr_exact')
+  merge_eval(main_eval, pr_f1, 'pr_f1')
+  merge_eval(main_eval, pr_oracle, 'pr_oracle')
+
+def histogram_na_prob(na_probs, qid_list, image_dir, name):
+  if not qid_list:
+    return
+  x = [na_probs[k] for k in qid_list]
+  weights = np.ones_like(x) / float(len(x))
+  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
+  plt.xlabel('Model probability of no-answer')
+  plt.ylabel('Proportion of dataset')
+  plt.title('Histogram of no-answer probability: %s' % name)
+  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
+  plt.clf()
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+  cur_score = num_no_ans
+  best_score = cur_score
+  best_thresh = 0.0
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  for i, qid in enumerate(qid_list):
+    if qid not in scores: continue
+    if qid_to_has_ans[qid]:
+      diff = scores[qid]
+    else:
+      if preds[qid]:
+        diff = -1
+      else:
+        diff = 0
+    cur_score += diff
+    if cur_score > best_score:
+      best_score = cur_score
+      best_thresh = na_probs[qid]
+  return 100.0 * best_score / len(scores), best_thresh
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+  best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+  main_eval['best_exact'] = best_exact
+  main_eval['best_exact_thresh'] = exact_thresh
+  main_eval['best_f1'] = best_f1
+  main_eval['best_f1_thresh'] = f1_thresh
+
+def main(OPTS):
+  with open(OPTS.data_file) as f:
+    dataset_json = json.load(f)
+    dataset = dataset_json['data']
+  with open(OPTS.pred_file) as f:
+    preds = json.load(f)
+  if OPTS.na_prob_file:
+    with open(OPTS.na_prob_file) as f:
+      na_probs = json.load(f)
+  else:
+    na_probs = {k: 0.0 for k in preds}
+  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
+  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+  exact_raw, f1_raw = get_raw_scores(dataset, preds)
+  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
+                                        OPTS.na_prob_thresh)
+  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
+                                     OPTS.na_prob_thresh)
+  out_eval = make_eval_dict(exact_thresh, f1_thresh)
+  if has_ans_qids:
+    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
+    merge_eval(out_eval, has_ans_eval, 'HasAns')
+  if no_ans_qids:
+    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
+    merge_eval(out_eval, no_ans_eval, 'NoAns')
+  if OPTS.na_prob_file:
+    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+  if OPTS.na_prob_file and OPTS.out_image_dir:
+    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 
+                                  qid_to_has_ans, OPTS.out_image_dir)
+    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
+    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
+  if OPTS.out_file:
+    with open(OPTS.out_file, 'w') as f:
+      json.dump(out_eval, f)
+  else:
+    print(json.dumps(out_eval, indent=2))
+  return out_eval
+
+if __name__ == '__main__':
+  OPTS = parse_args()
+  if OPTS.out_image_dir:
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt 
+  main(OPTS)

From 7322c314a613f0d5c16483814f6aaf544dbb4403 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 12 Jul 2019 14:24:08 +0200
Subject: [PATCH 114/139] remove python2 testing for examples

---
 .circleci/config.yml      |   2 -
 examples/run_bert_swag.py | 555 --------------------------------------
 2 files changed, 557 deletions(-)
 delete mode 100644 examples/run_bert_swag.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 65e392d2da..7f316b0b3a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,9 +24,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install tensorboardX scikit-learn mock
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
-            - run: python -m pytest -sv ./examples/
             - run: codecov
 workflows:
   version: 2
diff --git a/examples/run_bert_swag.py b/examples/run_bert_swag.py
deleted file mode 100644
index 00cd3a7840..0000000000
--- a/examples/run_bert_swag.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import
-
-import argparse
-import csv
-import logging
-import os
-import random
-import sys
-from io import open
-
-import numpy as np
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_transformers.tokenization_bert import BertTokenizer
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class SwagExample(object):
-    """A single training/test example for the SWAG dataset."""
-    def __init__(self,
-                 swag_id,
-                 context_sentence,
-                 start_ending,
-                 ending_0,
-                 ending_1,
-                 ending_2,
-                 ending_3,
-                 label = None):
-        self.swag_id = swag_id
-        self.context_sentence = context_sentence
-        self.start_ending = start_ending
-        self.endings = [
-            ending_0,
-            ending_1,
-            ending_2,
-            ending_3,
-        ]
-        self.label = label
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        l = [
-            "swag_id: {}".format(self.swag_id),
-            "context_sentence: {}".format(self.context_sentence),
-            "start_ending: {}".format(self.start_ending),
-            "ending_0: {}".format(self.endings[0]),
-            "ending_1: {}".format(self.endings[1]),
-            "ending_2: {}".format(self.endings[2]),
-            "ending_3: {}".format(self.endings[3]),
-        ]
-
-        if self.label is not None:
-            l.append("label: {}".format(self.label))
-
-        return ", ".join(l)
-
-
-class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
-        self.example_id = example_id
-        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
-            for _, input_ids, input_mask, segment_ids in choices_features
-        ]
-        self.label = label
-
-
-def read_swag_examples(input_file, is_training):
-    with open(input_file, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        lines = []
-        for line in reader:
-            if sys.version_info[0] == 2:
-                line = list(unicode(cell, 'utf-8') for cell in line)
-            lines.append(line)
-
-    if is_training and lines[0][-1] != 'label':
-        raise ValueError(
-            "For training, the input file must contain a label column."
-        )
-
-    examples = [
-        SwagExample(
-            swag_id = line[2],
-            context_sentence = line[4],
-            start_ending = line[5], # in the swag dataset, the
-                                         # common beginning of each
-                                         # choice is stored in "sent2".
-            ending_0 = line[7],
-            ending_1 = line[8],
-            ending_2 = line[9],
-            ending_3 = line[10],
-            label = int(line[11]) if is_training else None
-        ) for line in lines[1:] # we skip the line with the column names
-    ]
-
-    return examples
-
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 is_training):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    # Swag is a multiple choice task. To perform this task using Bert,
-    # we will use the formatting proposed in "Improving Language
-    # Understanding by Generative Pre-Training" and suggested by
-    # @jacobdevlin-google in this issue
-    # https://github.com/google-research/bert/issues/38.
-    #
-    # Each choice will correspond to a sample on which we run the
-    # inference. For a given Swag example, we will create the 4
-    # following inputs:
-    # - [CLS] context [SEP] choice_1 [SEP]
-    # - [CLS] context [SEP] choice_2 [SEP]
-    # - [CLS] context [SEP] choice_3 [SEP]
-    # - [CLS] context [SEP] choice_4 [SEP]
-    # The model will output a single value for each input. To get the
-    # final decision of the model, we will run a softmax over these 4
-    # outputs.
-    features = []
-    for example_index, example in enumerate(examples):
-        context_tokens = tokenizer.tokenize(example.context_sentence)
-        start_ending_tokens = tokenizer.tokenize(example.start_ending)
-
-        choices_features = []
-        for ending_index, ending in enumerate(example.endings):
-            # We create a copy of the context tokens in order to be
-            # able to shrink it according to ending_tokens
-            context_tokens_choice = context_tokens[:]
-            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
-            # Modifies `context_tokens_choice` and `ending_tokens` in
-            # place so that the total length is less than the
-            # specified length.  Account for [CLS], [SEP], [SEP] with
-            # "- 3"
-            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
-
-            tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
-            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            input_mask = [1] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding = [0] * (max_seq_length - len(input_ids))
-            input_ids += padding
-            input_mask += padding
-            segment_ids += padding
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            choices_features.append((tokens, input_ids, input_mask, segment_ids))
-
-        label = example.label
-        if example_index < 5:
-            logger.info("*** Example ***")
-            logger.info("swag_id: {}".format(example.swag_id))
-            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
-            if is_training:
-                logger.info("label: {}".format(label))
-
-        features.append(
-            InputFeatures(
-                example_id = example.swag_id,
-                choices_features = choices_features,
-                label = label
-            )
-        )
-
-    return features
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    # Prepare model
-    model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
-        num_choices=4)
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        try:
-            from apex.parallel import DistributedDataParallel as DDP
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        model = DDP(model)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    if args.do_train:
-
-        # Prepare data loader
-
-        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
-        train_features = convert_examples_to_features(
-            train_examples, tokenizer, args.max_seq_length, True)
-        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
-        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
-        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
-        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        if args.local_rank != -1:
-            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-        # Prepare optimizer
-
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-                loss = model(input_ids, segment_ids, input_mask, label_ids)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.fp16 and args.loss_scale != 1.0:
-                    # rescale loss for fp16 training
-                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                    loss = loss * args.loss_scale
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-
-
-    if args.do_train:
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-    else:
-        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
-    model.to(device)
-
-
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
-        eval_features = convert_examples_to_features(
-            eval_examples, tokenizer, args.max_seq_length, True)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
-        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
-        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
-        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss, eval_accuracy = 0, 0
-        nb_eval_steps, nb_eval_examples = 0, 0
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
-                logits = model(input_ids, segment_ids, input_mask)
-
-            logits = logits.detach().cpu().numpy()
-            label_ids = label_ids.to('cpu').numpy()
-            tmp_eval_accuracy = accuracy(logits, label_ids)
-
-            eval_loss += tmp_eval_loss.mean().item()
-            eval_accuracy += tmp_eval_accuracy
-
-            nb_eval_examples += input_ids.size(0)
-            nb_eval_steps += 1
-
-        eval_loss = eval_loss / nb_eval_steps
-        eval_accuracy = eval_accuracy / nb_eval_examples
-
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'global_step': global_step,
-                  'loss': tr_loss/global_step}
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-
-if __name__ == "__main__":
-    main()

From 7d4b200e4098dfdbe2dba4bf8252511445a807b8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 13 Jul 2019 15:25:03 +0200
Subject: [PATCH 115/139] good quality generation example for GPT, GPT-2,
 Transfo-XL, XLNet

---
 examples/run_generation.py                    | 198 ++++++++++++++++++
 examples/test_examples.py                     |  14 ++
 pytorch_transformers/modeling_gpt2.py         |   8 +-
 pytorch_transformers/modeling_openai.py       |   4 +
 pytorch_transformers/modeling_transfo_xl.py   |   4 +
 pytorch_transformers/modeling_xlnet.py        |  48 ++---
 .../tests/modeling_xlnet_test.py              |  15 +-
 .../tokenization_transfo_xl.py                |   1 -
 pytorch_transformers/tokenization_utils.py    |   4 +-
 pytorch_transformers/tokenization_xlnet.py    |   2 +-
 10 files changed, 252 insertions(+), 46 deletions(-)
 create mode 100644 examples/run_generation.py

diff --git a/examples/run_generation.py b/examples/run_generation.py
new file mode 100644
index 0000000000..047e24679f
--- /dev/null
+++ b/examples/run_generation.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Generation with GPT/GPT-2/Transformer-XL/XLNet models
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+from tqdm import trange
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+
+MODEL_CLASSES = {
+    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
+    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
+    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
+}
+
+# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+# in https://github.com/rusiaaman/XLNet-gen#methodology
+# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
+(except for Alexei and Maria) are discovered.
+The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+remainder of the story. 1883 Western Siberia,
+a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+Rasputin has a vision and denounces one of the men as a horse thief. Although his
+father initially slaps him for making such an accusation, Rasputin watches as the
+man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+
+def set_seed(args):
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+        Args:
+            logits: logits distribution shape (vocabulary size)
+            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[indices_to_remove] = filter_value
+    return logits
+
+
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+    context = torch.tensor(context, dtype=torch.long, device=device)
+    context = context.unsqueeze(0).repeat(num_samples, 1)
+    generated = context
+    with torch.no_grad():
+        for _ in trange(length):
+
+            inputs = {'input_ids': generated}
+            if is_xlnet: 
+                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
+                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
+                input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
+                perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
+                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+                target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
+                target_mapping[0, 0, -1] = 1.0  # predict last token
+                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
+
+            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
+            next_token_logits = outputs[0][0, -1, :] / temperature
+            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
+    return generated
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name', type=str, default=None, required=True,
+                        help="GPT, GPT-2, Transformer-XL or XLNet pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--prompt", type=str, default="")
+    parser.add_argument("--padding_text", type=str, default="")
+    parser.add_argument("--length", type=int, default=20)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top_k", type=int, default=0)
+    parser.add_argument("--top_p", type=float, default=0.9)
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    args = parser.parse_args()
+
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    args.n_gpu = torch.cuda.device_count()
+
+    set_seed(args)
+
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+
+    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name)
+    model = model_class.from_pretrained(args.model_name)
+    model.to(args.device)
+    model.eval()
+
+    if args.length < 0 and model.config.max_position_embeddings > 0:
+        args.length = model.config.max_position_embeddings
+    elif 0 < model.config.max_position_embeddings < args.length:
+        args.length = model.config.max_position_embeddings  # No generation bigger than model size 
+    elif args.length < 0:
+        args.length = MAX_LENGTH  # avoid infinite loop
+
+    print(args)
+    while True:
+        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
+        if args.model_type in ["transfo-xl", "xlnet"]:
+            # Models with memory likes to have a long prompt for short inputs.
+            raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
+        context_tokens = tokenizer.encode(raw_text)
+        out = sample_sequence(
+            model=model,
+            context=context_tokens,
+            length=args.length,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+            device=args.device,
+            is_xlnet=bool(args.model_type == "xlnet"),
+        )
+        out = out[0, len(context_tokens):].tolist()
+        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+        print(text)
+        if args.prompt:
+            break
+    return text
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 6b7090bcb9..8ea51b5726 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -29,6 +29,7 @@ except ImportError:
 
 import run_glue
 import run_squad
+import run_generation
 
 logging.basicConfig(level=logging.DEBUG)
 
@@ -91,5 +92,18 @@ class ExamplesTests(unittest.TestCase):
             self.assertGreaterEqual(result['exact'], 30)
 
 
+    def test_generation(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_generation.py",
+                    "--prompt=Hello",
+                    "--seed=42"]
+        model_name = "--model_name=openai-gpt"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_generation.main()
+            self.assertGreaterEqual(result['f1'], 30)
+            self.assertGreaterEqual(result['exact'], 30)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index b5fc6fc49b..0747c7a026 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -37,9 +37,9 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 logger = logging.getLogger(__name__)
 
 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-                                "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
+                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-                                 "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
+                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
@@ -195,6 +195,10 @@ class GPT2Config(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
     @property
     def hidden_size(self):
         return self.n_embd
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 9fb4720e93..d873aef619 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -214,6 +214,10 @@ class OpenAIGPTConfig(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )
 
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
     @property
     def hidden_size(self):
         return self.n_embd
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index b31723168a..f368d32636 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def max_position_embeddings(self):
+        return self.tgt_len + self.ext_len + self.mem_len
+
     @property
     def vocab_size(self):
         return self.n_token
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 051cc4e112..d3efd2799a 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -211,9 +211,6 @@ class XLNetConfig(PretrainedConfig):
             layers in the embeddings, encoder, and pooler.
         dropatt: The dropout ratio for the attention
             probabilities.
-        max_position_embeddings: The maximum sequence length that this model might
-            ever be used with. Typically set this to something large just in case
-            (e.g., 512 or 1024 or 2048).
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
         layer_norm_eps: The epsilon used by LayerNorm.
@@ -247,7 +244,6 @@ class XLNetConfig(PretrainedConfig):
                  untie_r=True,
                  attn_type="bi",
 
-                 max_position_embeddings=512,
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
 
@@ -289,7 +285,6 @@ class XLNetConfig(PretrainedConfig):
             self.untie_r = untie_r
             self.attn_type = attn_type
 
-            self.max_position_embeddings = max_position_embeddings
             self.initializer_range = initializer_range
             self.layer_norm_eps = layer_norm_eps
 
@@ -312,6 +307,10 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
+    @property
+    def max_position_embeddings(self):
+        return -1
+
     @property
     def vocab_size(self):
         return self.n_token
@@ -765,7 +764,7 @@ class XLNetModel(XLNetPreTrainedModel):
         return pos_emb
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None, head_mask=None):
+                mems=None, perm_mask=None, target_mapping=None, head_mask=None):
         """
         Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
 
@@ -790,10 +789,6 @@ class XLNetModel(XLNetPreTrainedModel):
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: [optional] float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
             head_mask: TODO Lysandre didn't fill
 
 
@@ -823,7 +818,6 @@ class XLNetModel(XLNetPreTrainedModel):
         attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
-        inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
 
         qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         mlen = mems[0].shape[0] if mems is not None else 0
@@ -878,12 +872,11 @@ class XLNetModel(XLNetPreTrainedModel):
         ##### Word embeddings and prepare h & g hidden states
         word_emb_k = self.word_embedding(input_ids)
         output_h = self.dropout(word_emb_k)
-        if inp_q is not None:
-            if target_mapping is not None:
-                word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
-            else:
-                inp_q_ext = inp_q[:, :, None]
-                word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
+        if target_mapping is not None:
+            word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
+        # else:  # We removed the inp_q input which was same as target mapping
+        #     inp_q_ext = inp_q[:, :, None]
+        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
             output_g = self.dropout(word_emb_q)
         else:
             output_g = None
@@ -994,7 +987,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
         """
          all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
@@ -1020,11 +1013,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                 on the j-th token.
                 Only used during pretraining for partial prediction.
                 Set to None during finetuning.
-            inp_q: [optional] float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pretraining for two-stream attention.
-                Set to None during finetuning.
-
 
         Returns:
             A ``tuple(encoded_layers, pooled_output)``, with
@@ -1054,7 +1042,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
             all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+                                               mems, perm_mask, target_mapping, head_mask)
 
         logits = self.lm_loss(transformer_outputs[0])
 
@@ -1103,7 +1091,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
         """
         Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
@@ -1129,10 +1117,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 on the j-th token.
                 Only used during pre-training for partial prediction.
                 Set to None during fine-tuning.
-            inp_q: float32 Tensor in shape [bsz, len].
-                1 for tokens with losses and 0 for tokens without losses.
-                Only used during pre-training for two-stream attention.
-                Set to None during fine-tuning.
             labels: TODO Lysandre didn't fill
             head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
@@ -1161,7 +1145,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+                                               mems, perm_mask, target_mapping, head_mask)
         output = transformer_outputs[0]
 
         output = self.sequence_summary(output)
@@ -1215,7 +1199,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
+                mems=None, perm_mask=None, target_mapping=None,
                 start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
                 head_mask=None):
 
@@ -1266,7 +1250,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
         """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, inp_q, head_mask)
+                                               mems, perm_mask, target_mapping, head_mask)
         hidden_states = transformer_outputs[0]
         start_logits = self.start_logits(hidden_states, p_mask)
 
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index 3792125d6e..8360a08d60 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -97,7 +97,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
             target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
             target_mapping[:, 0, -1] = 1.0  # predict last token
-            inp_q = target_mapping[:, 0, :].clone()  # predict last token
 
             sequence_labels = None
             lm_labels = None
@@ -124,14 +123,14 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 num_labels=self.type_sequence_label_size)
 
             return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                    target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
+                    target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
 
         def set_seed(self):
             random.seed(self.seed)
             torch.manual_seed(self.seed)
 
         def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetModel(config)
             model.eval()
 
@@ -153,7 +152,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
         def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetLMHeadModel(config)
             model.eval()
 
@@ -161,7 +160,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
 
             loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1)
 
-            logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping, inp_q=inp_q)
+            logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
 
             result = {
                 "loss_1": loss_1,
@@ -193,7 +192,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
         def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetForQuestionAnswering(config)
             model.eval()
 
@@ -243,7 +242,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
 
         def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
+                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
             model = XLNetForSequenceClassification(config)
             model.eval()
 
@@ -269,7 +268,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
-                target_mapping, inp_q, segment_ids, lm_labels,
+                target_mapping, segment_ids, lm_labels,
                 sequence_labels, is_impossible_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids_1}
             return config, inputs_dict
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index fe1fe28e9a..98b4eb6ff5 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -25,7 +25,6 @@ import os
 import sys
 from collections import Counter, OrderedDict
 from io import open
-import unicodedata
 
 import torch
 import numpy as np
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index a84b8d6f44..9840e75225 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -343,7 +343,7 @@ class PreTrainedTokenizer(object):
             return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
                         for sub_text in split_text), [])[:-1]
 
-        added_tokens = list(self.added_tokens_encoder.keys())
+        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
         return tokenized_text
 
@@ -466,7 +466,7 @@ class PreTrainedTokenizer(object):
 
 
 def clean_up_tokenization(out_string):
-    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                     ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                     ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
     return out_string
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 48ec3d88a1..d7317b2afc 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -172,7 +172,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
     def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        out_string = ''.join(tokens_ids)
+        out_string = ''.join(tokens_ids).replace(SPIECE_UNDERLINE, ' ')
         return out_string
 
     def save_vocabulary(self, save_directory):

From c490f5ce8703ebdf49f8dc62f280045df92c1049 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 13 Jul 2019 15:26:58 +0200
Subject: [PATCH 116/139] added generation examples in tests

---
 examples/test_examples.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index 8ea51b5726..989ec367ee 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -98,12 +98,12 @@ class ExamplesTests(unittest.TestCase):
 
         testargs = ["run_generation.py",
                     "--prompt=Hello",
+                    "--length=10",
                     "--seed=42"]
         model_name = "--model_name=openai-gpt"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_generation.main()
-            self.assertGreaterEqual(result['f1'], 30)
-            self.assertGreaterEqual(result['exact'], 30)
+            self.assertGreaterEqual(len(result), 10)
 
 if __name__ == "__main__":
     unittest.main()

From 2397f958f99767290e8bc54f96e1df62f63d34af Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 14 Jul 2019 23:20:10 +0200
Subject: [PATCH 117/139] updating examples and doc

---
 README.md                                     |  21 +-
 docs/source/index.rst                         |  19 +-
 docs/source/model_doc/bert.rst                |   4 +-
 docs/source/model_doc/gpt.rst                 |   7 -
 docs/source/model_doc/overview.rst            |   2 +-
 .../lm_finetuning/finetune_on_pregenerated.py |   4 +-
 examples/run_bertology.py                     | 384 +++++----
 examples/run_generation.py                    |   2 +-
 examples/run_glue.py                          |   9 +-
 examples/run_squad.py                         |  38 +-
 .../run_openai_gpt.py                         |   4 +-
 .../{ => single_model_scripts}/run_swag.py    |   2 +-
 .../run_transfo_xl.py                         |   0
 examples/test_examples.py                     |   1 -
 pytorch_transformers/modeling_bert.py         | 764 ++++++++----------
 pytorch_transformers/modeling_utils.py        |   7 +
 16 files changed, 601 insertions(+), 667 deletions(-)
 rename examples/{ => single_model_scripts}/run_openai_gpt.py (98%)
 rename examples/{ => single_model_scripts}/run_swag.py (99%)
 rename examples/{ => single_model_scripts}/run_transfo_xl.py (100%)

diff --git a/README.md b/README.md
index f916627b90..dba18a0d5e 100644
--- a/README.md
+++ b/README.md
@@ -131,11 +131,8 @@ This package comprises the following classes that can be imported in Python and
 - Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file):
   - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
-- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
-  - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
-
-- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_transformers/optimization_openai.py) file):
-  - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+- Optimizer (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
+  - `AdamW` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
 - Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
@@ -1104,12 +1101,11 @@ Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt
 
 ### Optimizers
 
-#### `BertAdam`
+#### `AdamW`
 
-`BertAdam` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
+`AdamW` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
 
-- BertAdam implements weight decay fix,
-- BertAdam doesn't compensate for bias as in the regular Adam optimizer.
+- AdamW implements weight decay fix,
 
 The optimizer accepts the following arguments:
 
@@ -1127,13 +1123,6 @@ The optimizer accepts the following arguments:
 - `weight_decay:` Weight decay. Default : `0.01`
 - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`
 
-#### `OpenAIAdam`
-
-`OpenAIAdam` is similar to `BertAdam`.
-The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in the regular Adam optimizer.
-
-`OpenAIAdam` accepts the same arguments as `BertAdam`.
-
 #### Learning Rate Schedules
 
 The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ded234354d..aedb231163 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -60,10 +60,10 @@ This PyTorch implementation of Transformer-XL is an adaptation of the original `
 This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`__ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
 **Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`__. TODO Lysandre filled
+This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`__.
 
 **Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le.
-This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation <https://github.com/zihangdai/xlnet>`__. TODO Lysandre filled
+This PyTorch implementation of XLM is an adaptation of the `Tensorflow implementation <https://github.com/zihangdai/xlnet>`__.
 
 
 Content
@@ -91,7 +91,7 @@ Content
    * - `Migration <./migration.html>`__
      - Migrating from ``pytorch_pretrained_BERT`` (v0.6) to ``pytorch_transformers`` (v1.0)
    * - `Bertology <./bertology.html>`__
-     - TODO Lysandre didn't know how to fill
+     - Exploring the internals of the pretrained models.
    * - `TorchScript <./torchscript.html>`__
      - Convert a model to TorchScript for use in other programming languages
 
@@ -115,8 +115,6 @@ Content
    * - `XLNet <./model_doc/xlnet.html>`__
      - XLNet Models, Tokenizers and optimizers
 
-TODO Lysandre filled: might need an introduction for both parts. Is it even necessary, since there is a summary? Up to you Thom.
-
 Overview
 --------
 
@@ -219,17 +217,10 @@ TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers beca
 
 
 *
-  Optimizer for **BERT** (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file):
+  Optimizer (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file):
 
 
-  * ``BertAdam`` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
-
-
-*
-  Optimizer for **OpenAI GPT** (in the `optimization_openai.py <./_modules/pytorch_transformers/optimization_openai.html>`__ file):
-
-
-  * ``OpenAIAdam`` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+  * ``AdamW`` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
 
 *
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 3a2e12a6dd..8c786aa24f 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -15,10 +15,10 @@ BERT
     :members:
 
 
-``BertAdam``
+``AdamW``
 ~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertAdam
+.. autoclass:: pytorch_transformers.AdamW
     :members:
 
 ``BertModel``
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 815cbe5787..26762ae011 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -15,13 +15,6 @@ OpenAI GPT
     :members:
 
 
-``OpenAIAdam``
-~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_transformers.OpenAIAdam
-    :members:
-
-
 ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index 00e538e68d..7e24115ae3 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -236,7 +236,7 @@ Learning Rate Schedules
 
 The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``.
 All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction.
-When an ``_LRSchedule`` object is passed into ``BertAdam`` or ``OpenAIAdam``\ ,
+When an ``_LRSchedule`` object is passed into ``AdamW``\ ,
 the ``warmup`` and ``t_total`` arguments on the optimizer are ignored and the ones in the ``_LRSchedule`` object are used.
 An overview of the implemented schedules:
 
diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 505cd466f6..fe958345d1 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -16,7 +16,7 @@ from tqdm import tqdm
 from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
 from pytorch_transformers.modeling_bert import BertForPreTraining
 from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
 
 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
 
@@ -273,7 +273,7 @@ def main():
         warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                              t_total=num_train_optimization_steps)
     else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              warmup=args.warmup_proportion,
                              t_total=num_train_optimization_steps)
diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index 096b1b44fc..61c7440ecb 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -1,4 +1,24 @@
 #!/usr/bin/env python3
+# Copyright 2018 CMU and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Bertology: this script shows how you can explore the internals of the models in the library to:
+    - compute the entropy of the head attentions
+    - compute the importance of each head
+    - prune (remove) the low importance head.
+    Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
+    which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
+"""
 import os
 import argparse
 import logging
@@ -12,43 +32,49 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_transformers import BertForSequenceClassification, BertTokenizer
+from pytorch_transformers import (WEIGHTS_NAME,
+                                  BertConfig, BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
+                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
 
-from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
+from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
 
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
 
 logger = logging.getLogger(__name__)
 
 
 def entropy(p):
+    """ Compute the entropy of a probability distribution """
     plogp = p * torch.log(p)
     plogp[p == 0] = 0
     return -plogp.sum(dim=-1)
 
 
-def print_1d_tensor(tensor, prefix=""):
-    if tensor.dtype != torch.long:
-        logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data))
-    else:
-        logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data))
-
-
 def print_2d_tensor(tensor):
+    """ Print a 2D tensor """
     logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
     for row in range(len(tensor)):
-        print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t")
+        if tensor.dtype != torch.long:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
+        else:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
 
 
 def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
-    """ Example on how to use model outputs to compute:
-        - head attention entropy (activated by setting output_attentions=True when we created the model
+    """ This method shows how to compute:
+        - head attention entropy
         - head importance scores according to http://arxiv.org/abs/1905.10650
-            (activated by setting keep_multihead_output=True when we created the model)
     """
     # Prepare our tensors
     n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
     head_importance = torch.zeros(n_layers, n_heads).to(args.device)
     attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
+
+    if head_mask is None:
+        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+    head_mask.requires_grad_(requires_grad=True)
     preds = None
     labels = None
     tot_tokens = 0.0
@@ -58,29 +84,17 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
         input_ids, input_mask, segment_ids, label_ids = batch
 
         # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask)
+        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
+        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        loss.backward()  # Backpropagate to populate the gradients in the head mask
 
         if compute_entropy:
-            # Update head attention entropy
             for layer, attn in enumerate(all_attentions):
                 masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
                 attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
 
         if compute_importance:
-            # Update head importance scores with regards to our loss
-            # First, backpropagate to populate the gradients
-            if args.output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
-            elif args.output_mode == "regression":
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), label_ids.view(-1))
-            loss.backward()
-            # Second, compute importance scores according to http://arxiv.org/abs/1905.10650
-            multihead_outputs = model.bert.get_multihead_outputs()
-            for layer, mh_layer_output in enumerate(multihead_outputs):
-                dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
-                head_importance[layer] += dot.abs().sum(-1).sum(0).detach()
+            head_importance += head_mask.grad.abs().detach()
 
         # Also store our logits/labels if we want to compute metrics afterwards
         if preds is None:
@@ -104,30 +118,137 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
     if not args.dont_normalize_global_importance:
         head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
 
+    # Print/save matrices
+    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+
+    logger.info("Attention entropies")
+    print_2d_tensor(attn_entropy)
+    logger.info("Head importance scores")
+    print_2d_tensor(head_importance)
+    logger.info("Head ranked by importance scores")
+    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks = head_ranks.view_as(head_importance)
+    print_2d_tensor(head_ranks)
+
     return attn_entropy, head_importance, preds, labels
 
 
-def run_model():
+def mask_heads(args, model, eval_dataloader):
+    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
+        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+
+    new_head_mask = torch.ones_like(head_importance)
+    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
+
+    current_score = original_score
+    while current_score >= original_score * args.masking_threshold:
+        head_mask = new_head_mask.clone() # save current head mask
+        # heads from least important to most - keep only not-masked heads
+        head_importance[head_mask == 0.0] = float('Inf')
+        current_heads_to_mask = head_importance.view(-1).sort()[1]
+
+        if len(current_heads_to_mask) <= num_to_mask:
+            break
+
+        # mask heads
+        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
+        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
+        new_head_mask = new_head_mask.view(-1)
+        new_head_mask[current_heads_to_mask] = 0.0
+        new_head_mask = new_head_mask.view_as(head_mask)
+        print_2d_tensor(new_head_mask)
+
+        # Compute metric and head importance again
+        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+
+    logger.info("Final head mask")
+    print_2d_tensor(head_mask)
+    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+
+    return head_mask
+
+
+def prune_heads(args, model, eval_dataloader, head_mask):
+    """ This method shows how to prune head (remove heads weights) based on
+        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    # Try pruning and test time speedup
+    # Pruning is like masking but we actually remove the masked weights
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_time = datetime.now() - before_time
+
+    original_num_params = sum(p.numel() for p in model.parameters())
+    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
+    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
+    model.prune_heads(heads_to_prune)
+    pruned_num_params = sum(p.numel() for p in model.parameters())
+
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    new_time = datetime.now() - before_time
+
+    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
+    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
+
+
+def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.")
-    parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory")
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
 
-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1")
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--data_subset", type=int, default=-1,
+                        help="If > 0: limit the data to a subset of data_subset instances.")
+    parser.add_argument("--overwrite_output_dir", action='store_true',
+                        help="Whether to overwrite data in output directory")
 
-    parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics"
-                                                                             "(stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
+    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
+                        help="Don't normalize importance score by layers")
+    parser.add_argument("--dont_normalize_global_importance", action='store_true',
+                        help="Don't normalize all importance scores between 0 and 1")
 
-    parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
+    parser.add_argument("--try_masking", action='store_true',
+                        help="Whether to try to mask head until a threshold of accuracy.")
+    parser.add_argument("--masking_threshold", default=0.9, type=float,
+                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
+    parser.add_argument("--masking_amount", default=0.1, type=float,
+                        help="Amount to heads to masking at each masking step.")
+    parser.add_argument("--metric_name", default="acc", type=str,
+                        help="Metric to use for head masking.")
+
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, sequences shorter padded.")
     parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
 
     parser.add_argument("--seed", type=int, default=42)
@@ -147,164 +268,79 @@ def run_model():
     # Setup devices and distributed training
     if args.local_rank == -1 or args.no_cuda:
         args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+        args.n_gpu = torch.cuda.device_count()
     else:
         torch.cuda.set_device(args.local_rank)
         args.device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
+        args.n_gpu = 1
         torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
 
     # Setup logging
     logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1)))
+    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
 
     # Set seeds
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed(args.seed)
+    set_seed(args)
 
     # Prepare GLUE task
-    task_name = args.task_name.lower()
-    processor = processors[task_name]()
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
     label_list = processor.get_labels()
-    args.output_mode = output_modes[task_name]
-    args.num_labels = len(label_list)
+    num_labels = len(label_list)
 
-    # Prepare output directory
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    # Load model & tokenizer
+    # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
+                                          num_labels=num_labels, finetuning_task=args.task_name,
+                                          output_attentions=True)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
 
-    # Load a model with all BERTology options on:
-    #   output_attentions => will output attention weights
-    #   keep_multihead_output => will store gradient of attention head outputs for head importance computation
-    #       see: http://arxiv.org/abs/1905.10650
-    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
-                                                          num_labels=args.num_labels,
-                                                          output_attentions=True,
-                                                          keep_multihead_output=True)
     if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
     model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
-    model.eval()
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Print/save training arguments
+    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    logger.info("Training/evaluation parameters %s", args)
 
     # Prepare dataset for the GLUE task
-    eval_examples = processor.get_dev_examples(args.data_dir)
-    cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-        list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
-    try:
-        eval_features = torch.load(cached_eval_features_file)
-    except:
-        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode)
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving eval features to cache file %s", cached_eval_features_file)
-            torch.save(eval_features, cached_eval_features_file)
-
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float)
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-
+    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
     if args.data_subset > 0:
         eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-
     eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
 
-    # Print/save training arguments
-    print(args)
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
 
     # Compute head entropy and importance score
-    attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader)
+    compute_heads_importance(args, model, eval_dataloader)
 
-    # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
 
-    logger.info("Attention entropies")
-    print_2d_tensor(attn_entropy)
-    logger.info("Head importance scores")
-    print_2d_tensor(head_importance)
-    logger.info("Head ranked by importance scores")
-    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
-    head_ranks = head_ranks.view_as(head_importance)
-    print_2d_tensor(head_ranks)
-
-    # Do masking if we want to
+    # Try head masking (set heads to zero until the score goes under a threshole)
+    # and head pruning (remove masked heads and see the effect on the network)
     if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        original_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-        logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+        head_mask = mask_heads(args, model, eval_dataloader)
+        prune_heads(args, model, eval_dataloader, head_mask)
 
-        new_head_mask = torch.ones_like(head_importance)
-        num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
-
-        current_score = original_score
-        while current_score >= original_score * args.masking_threshold:
-            head_mask = new_head_mask.clone() # save current head mask
-            # heads from least important to most - keep only not-masked heads
-            head_importance[head_mask == 0.0] = float('Inf')
-            current_heads_to_mask = head_importance.view(-1).sort()[1]
-
-            if len(current_heads_to_mask) <= num_to_mask:
-                break
-
-            # mask heads
-            current_heads_to_mask = current_heads_to_mask[:num_to_mask]
-            logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
-            new_head_mask = new_head_mask.view(-1)
-            new_head_mask[current_heads_to_mask] = 0.0
-            new_head_mask = new_head_mask.view_as(head_mask)
-            print_2d_tensor(new_head_mask)
-
-            # Compute metric and head importance again
-            _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
-            preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-            current_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-            logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
-
-        logger.info("Final head mask")
-        print_2d_tensor(head_mask)
-        np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
-
-        # Try pruning and test time speedup
-        # Pruning is like masking but we actually remove the masked weights
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=head_mask)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_masking = compute_metrics(task_name, preds, labels)[args.metric_name]
-        original_time = datetime.now() - before_time
-
-        original_num_params = sum(p.numel() for p in model.parameters())
-        heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
-        assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
-        model.bert.prune_heads(heads_to_prune)
-        pruned_num_params = sum(p.numel() for p in model.parameters())
-
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=None)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name]
-        new_time = datetime.now() - before_time
-
-        logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
-        logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-        logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
 
 if __name__ == '__main__':
-    run_model()
+    main()
diff --git a/examples/run_generation.py b/examples/run_generation.py
index 047e24679f..4108b2894a 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -14,7 +14,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Generation with GPT/GPT-2/Transformer-XL/XLNet models
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
 """
 from __future__ import absolute_import, division, print_function, unicode_literals
 
diff --git a/examples/run_glue.py b/examples/run_glue.py
index f0633c3f12..ea5cc9f42d 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning a classification model (Bert, XLM, XLNet,...) on GLUE."""
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -230,6 +230,9 @@ def evaluate(args, model, tokenizer, prefix=""):
                 logger.info("  %s = %s", key, str(result[key]))
                 writer.write("%s = %s\n" % (key, str(result[key])))
 
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
     return results
 
 
@@ -242,7 +245,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+    if os.path.exists(cached_features_file):
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
@@ -410,7 +413,7 @@ def main():
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    # Distributed and parrallel training
+    # Distributed and parallel training
     model.to(args.device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
diff --git a/examples/run_squad.py b/examples/run_squad.py
index af4a771f4a..24f00e0518 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning a question-answering model (Bert, XLM, XLNet,...) on SQuAD."""
+""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -21,7 +21,7 @@ import argparse
 import logging
 import os
 import random
-from io import open
+import glob
 
 import numpy as np
 import torch
@@ -43,6 +43,9 @@ from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
+# The follwing import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
 from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
 
 logger = logging.getLogger(__name__)
@@ -123,7 +126,7 @@ def train(args, train_dataset, model, tokenizer):
             loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
             if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps
 
@@ -169,6 +172,9 @@ def train(args, train_dataset, model, tokenizer):
             train_iterator.close()
             break
 
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
     return global_step, tr_loss / global_step
 
 
@@ -208,16 +214,16 @@ def evaluate(args, model, tokenizer, prefix=""):
                                          start_logits=start_logits,
                                          end_logits=end_logits))
 
+    # Compute predictions
     output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
     output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
     output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
-    all_predictions = write_predictions(examples, features, all_results,
-                                        args.n_best_size, args.max_answer_length,
-                                        args.do_lower_case, output_prediction_file,
-                                        output_nbest_file, output_null_log_odds_file,
-                                        args.verbose_logging, args.version_2_with_negative,
-                                        args.null_score_diff_threshold)
+    write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length,
+                      args.do_lower_case, output_prediction_file, output_nbest_file,
+                      output_null_log_odds_file, args.verbose_logging,
+                      args.version_2_with_negative, args.null_score_diff_threshold)
 
+    # Evaluate with the official SQuAD script
     evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                  pred_file=output_prediction_file,
                                  na_prob_file=output_null_log_odds_file)
@@ -432,7 +438,7 @@ def main():
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    # Save the trained model and the tokenizer
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
@@ -454,22 +460,30 @@ def main():
         model.to(args.device)
 
 
-    # Evaluation
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
         for checkpoint in checkpoints:
+            # Reload the model
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
+
+            # Evaluate
             result = evaluate(args, model, tokenizer, prefix=global_step)
+
             result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
             results.update(result)
+
     logger.info("Results: {}".format(results))
+
     return results
 
 
diff --git a/examples/run_openai_gpt.py b/examples/single_model_scripts/run_openai_gpt.py
similarity index 98%
rename from examples/run_openai_gpt.py
rename to examples/single_model_scripts/run_openai_gpt.py
index 02b86b3a22..b2e85271cb 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/single_model_scripts/run_openai_gpt.py
@@ -40,7 +40,7 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
 from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
+                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
 
@@ -191,7 +191,7 @@ def main():
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
             ]
         num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
-        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                                lr=args.learning_rate,
                                warmup=args.warmup_proportion,
                                max_grad_norm=args.max_grad_norm,
diff --git a/examples/run_swag.py b/examples/single_model_scripts/run_swag.py
similarity index 99%
rename from examples/run_swag.py
rename to examples/single_model_scripts/run_swag.py
index 00cd3a7840..fdda56e40b 100644
--- a/examples/run_swag.py
+++ b/examples/single_model_scripts/run_swag.py
@@ -34,7 +34,7 @@ from tqdm import tqdm, trange
 
 from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
 from pytorch_transformers.tokenization_bert import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
diff --git a/examples/run_transfo_xl.py b/examples/single_model_scripts/run_transfo_xl.py
similarity index 100%
rename from examples/run_transfo_xl.py
rename to examples/single_model_scripts/run_transfo_xl.py
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 989ec367ee..a07c0ea31b 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -91,7 +91,6 @@ class ExamplesTests(unittest.TestCase):
             self.assertGreaterEqual(result['f1'], 30)
             self.assertGreaterEqual(result['exact'], 30)
 
-
     def test_generation(self):
         stream_handler = logging.StreamHandler(sys.stdout)
         logger.addHandler(stream_handler)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 8c75925a07..a8239038a7 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch BERT model."""
+"""PyTorch BERT model. """
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
@@ -28,7 +28,8 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
+from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel,
+                             prune_linear_layer, add_start_docstrings)
 
 logger = logging.getLogger(__name__)
 
@@ -66,7 +67,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model
+    """ Load tf checkpoints in a pytorch model.
     """
     try:
         import re
@@ -583,25 +584,84 @@ class BertPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
 
 
+BERT_START_DOCSTRING = r"""    The BERT model was proposed in
+    `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
+    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+    pre-trained using a combination of masked language modeling objective and next sentence prediction
+    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
+        https://arxiv.org/abs/1810.04805
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+"""
+
+BERT_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                
+                ``token_type_ids:   0   0   0   0  0     0   0``
+    
+            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertModel(BertPreTrainedModel):
-    r"""BERT model ("Bidirectional Embedding Representations from a Transformer").
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    :class:`~pytorch_transformers.BertModel` is the basic BERT Transformer model with a layer of summed token, \
-    position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 \
-    for BERT-large). The model is instantiated with the following parameters.
+    Examples::
 
-    Arguments:
-        config: a BertConfig class instance with the configuration to build a new model
-        output_attentions: If True, also output attentions weights computed by the model at each layer. Default: False
-        output_hidden_states: If True, also output hidden states computed by the model at each layer. Default: Fals
-
-
-    Example::
-
-        config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.BertModel(config=config)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -628,58 +688,6 @@ class BertModel(BertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-
-        Arguments:
-            input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length] with the word token indices in the \
-                vocabulary(see the tokens pre-processing logic in the scripts `run_bert_extract_features.py`, \
-                `run_bert_classifier.py` and `run_bert_squad.py`)
-            token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token \
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to \
-                a `sentence B` token (see BERT paper for more details).
-            attention_mask: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices \
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max \
-                input sequence length in the current batch. It's the mask that we typically use for attention when \
-                a batch has varying length sentences.
-            output_all_encoded_layers: boolean which controls the content of the `encoded_layers` output as described \
-            below. Default: `True`.
-            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 \
-            and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 \
-            => head is not masked.
-
-
-        Returns:
-            A tuple composed of (encoded_layers, pooled_output). Encoded layers are controlled by the \
-            ``output_all_encoded_layers`` argument.
-
-            If ``output_all_encoded_layers`` is set to True, outputs a list of the full sequences of \
-            encoded-hidden-states at the end of each attention \
-            block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a\
-            ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size].
-
-            If set to False, outputs only the full sequence of hidden-states corresponding \
-            to the last attention block of shape [batch_size, sequence_length, hidden_size].
-
-            ``pooled_output`` is a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
-            classifier pretrained on top of the hidden state associated to the first character of the \
-            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-
-
-        """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -726,25 +734,47 @@ class BertModel(BertPreTrainedModel):
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with two heads on top as done during the pre-training:
+    a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
-    """BERT model with pre-training heads.
-    This module comprises the BERT model followed by the two pre-training heads:
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+        **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
+            Indices should be in ``[0, 1]``.
+            ``0`` indicates sequence B is a continuation of sequence A,
+            ``1`` indicates sequence B is a random sequence.
 
-        - the masked language modeling head, and
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)``
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-        - the next sentence classification head.
+    Examples::
 
-    Args:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForPreTraining(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, seq_relationship_scores = outputs[:1]
 
-    Example ::
-
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = BertForPreTraining(config)
     """
     def __init__(self, config):
         super(BertForPreTraining, self).__init__(config)
@@ -764,58 +794,6 @@ class BertForPreTraining(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `masked_lm_labels`: optional masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., vocab_size]
-            `next_sentence_label`: optional next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, 1].
-                0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-
-        Returns:
-            Either a ``torch.Tensor`` or ``tuple(torch.Tensor, torch.Tensor)``.
-
-            if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
-             sum of the masked language modeling loss and the next \
-            sentence classification loss.
-
-            if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple made of:
-
-                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]
-
-                - the next sentence classification logits of shape [batch_size, 2].
-
-        Example ::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            model = BertForPreTraining(config)
-            masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            masked_lm_logits_scores, seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
@@ -833,21 +811,39 @@ class BertForPreTraining(BertPreTrainedModel):
         return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
-    """BERT model with the masked language modeling head.
-    This module comprises the BERT model followed by the masked language modeling head.
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
 
-    Args:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForMaskedLM(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, masked_lm_labels=input_ids)
+        >>> loss, prediction_scores = outputs[:1]
 
-        model = BertForMaskedLM(config)
     """
     def __init__(self, config):
         super(BertForMaskedLM, self).__init__(config)
@@ -866,45 +862,6 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `masked_lm_labels`: masked language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., vocab_size]
-            `head_mask`: an optional ``torch.LongTensor`` of shape [num_heads] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            Masked language modeling loss if ``masked_lm_labels`` is specified, masked language modeling
-            logits of shape [batch_size, sequence_length, vocab_size] otherwise.
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
-            # or
-            masked_lm_logits_scores = model.forward(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output = outputs[0]
@@ -919,21 +876,39 @@ class BertForMaskedLM(BertPreTrainedModel):
         return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
-    """BERT model with next sentence prediction head.
-    This module comprises the BERT model followed by the next sentence classification head.
+    r"""
+        **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
+            Indices should be in ``[0, 1]``.
+            ``0`` indicates sequence B is a continuation of sequence A,
+            ``1`` indicates sequence B is a random sequence.
 
-    Args:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Next sequence prediction (classification) loss.
+        **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)``
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForNextSentencePrediction(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> seq_relationship_scores = outputs[0]
 
-        model = BertForNextSentencePrediction(config)
     """
     def __init__(self, config):
         super(BertForNextSentencePrediction, self).__init__(config)
@@ -944,44 +919,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `next_sentence_label`: next sentence classification loss: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, 1].
-                0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between
-                0 and 1.It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
-                0.0 => head is not masked.
-
-        Returns:
-            If ``next_sentence_label`` is specified, outputs the total_loss which is the sum of the masked language
-            modeling loss and the next sentence classification loss. If ``next_sentence_label`` is ``None``, outputs
-            the next sentence classification logits of shape [batch_size, 2].
-
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
@@ -996,25 +933,41 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
-    """BERT model for classification.
-    This module is composed of the BERT model with a linear layer on top of
-    the pooled output.
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
 
-    Params:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
-        `num_labels`: the number of classes for the classifier. Default = 2.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForSequenceClassification(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:1]
 
-        num_labels = 2
-
-        model = BertForSequenceClassification(config, num_labels)
     """
     def __init__(self, config):
         super(BertForSequenceClassification, self).__init__(config)
@@ -1027,40 +980,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Parameters:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, ..., num_labels].
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
-            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
@@ -1082,26 +1001,78 @@ class BertForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    BERT_START_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
-    """BERT model for multiple choice tasks.
-    This module is composed of the BERT model with a linear layer on top of the pooled output.
+    r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
 
-    Parameters:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+            (a) For sequence pairs:
 
-    Example::
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
-        # Already been converted into WordPiece token ids
-        input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
-        input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
-        token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                
+                ``token_type_ids:   0   0   0   0  0     0   0``
+    
+            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForMultipleChoice(config)
+        >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, classification_scores = outputs[:1]
 
-        model = BertForMultipleChoice(config)
-        logits = model(input_ids, token_type_ids, input_mask)
     """
     def __init__(self, config):
         super(BertForMultipleChoice, self).__init__(config)
@@ -1113,42 +1084,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Parameters:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
-                with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
-                and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, ..., num_choices].
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
-            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, num_labels].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
-            input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
-            token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            model = BertForMultipleChoice(config)
-            logits = model(input_ids, token_type_ids, input_mask)
-        """
-        """ Input shapes should be [bsz, num choices, seq length] """
         num_choices = input_ids.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -1171,25 +1106,39 @@ class BertForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
-    """BERT model for token-level classification.
-    This module is composed of the BERT model with a linear layer on top of
-    the full hidden state of the last layer.
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
 
-    Parameters:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
-        `num_labels`: the number of classes for the classifier. Default = 2.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForTokenClassification(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, scores = outputs[:1]
 
-        num_labels = 2
-
-        model = BertForTokenClassification(config, num_labels)
     """
     def __init__(self, config):
         super(BertForTokenClassification, self).__init__(config)
@@ -1202,40 +1151,6 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Parameters:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `labels`: labels for the classification output: ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with indices selected in [0, ..., num_labels].
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``labels`` is not ``None``, outputs the CrossEntropy classification loss of the output with the labels.
-            If ``labels`` is ``None``, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
@@ -1255,25 +1170,50 @@ class BertForTokenClassification(BertPreTrainedModel):
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             outputs = (loss,) + outputs
 
-        return outputs  # (loss), logits, (hidden_states), (attentions)
+        return outputs  # (loss), scores, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Bert Model transformer BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
-    """BERT model for Question Answering (span extraction).
-    This module is composed of the BERT model with a linear layer on top of
-    the sequence output that computes start_logits and end_logits
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
 
-    Parameters:
-        `config`: a BertConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = BertConfig.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> 
+        >>> model = BertForQuestionAnswering(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
-        model = BertForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(BertForQuestionAnswering, self).__init__(config)
@@ -1286,44 +1226,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Parameters:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see BERT paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the
-            CrossEntropy loss for the start and end token positions.
-            If ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of start_logits, end_logits which are the
-            logits respectively for the start and end position tokens of shape [batch_size, sequence_length].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-        """
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index bb2b82b41c..8971af306e 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -36,6 +36,13 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = ''.join(docstr) + fn.__doc__
+        return fn
+    return docstring_decorator
+
+
 class PretrainedConfig(object):
     """ An abstract class to handle dowloading a model pretrained config.
     """

From 0e9825e2527dfbf1d51520a3ec88416327bb750e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 14 Jul 2019 23:43:28 +0200
Subject: [PATCH 118/139] small fix to run_glue

---
 examples/run_glue.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index ea5cc9f42d..979c644471 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -166,6 +166,9 @@ def train(args, train_dataset, model, tokenizer):
             train_iterator.close()
             break
 
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
     return global_step, tr_loss / global_step
 
 
@@ -230,9 +233,6 @@ def evaluate(args, model, tokenizer, prefix=""):
                 logger.info("  %s = %s", key, str(result[key]))
                 writer.write("%s = %s\n" % (key, str(result[key])))
 
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
     return results
 
 

From 183fedfed5e302d581714173d9ac1f232a128fbd Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 09:00:09 +0200
Subject: [PATCH 119/139] fix doc on python2

---
 pytorch_transformers/modeling_bert.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index a8239038a7..c931ac6ab2 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -642,7 +642,7 @@ BERT_INPUTS_DOCSTRING = r"""
 @add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertModel(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
@@ -738,7 +738,7 @@ class BertModel(BertPreTrainedModel):
     a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
@@ -814,7 +814,7 @@ class BertForPreTraining(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
@@ -879,7 +879,7 @@ class BertForMaskedLM(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
             Indices should be in ``[0, 1]``.
@@ -937,7 +937,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     the pooled output) e.g. for GLUE tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the sequence classification/regression loss.
             Indices should be in ``[0, ..., config.num_labels]``.
@@ -1005,7 +1005,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
     BERT_START_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
@@ -1110,7 +1110,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels]``.

From 5bc3d0cc5b72efc765d35ba3b99620ccb7a38ff1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 09:40:05 +0200
Subject: [PATCH 120/139] added gpt2 doc

---
 pytorch_transformers/modeling_bert.py |  55 ++--
 pytorch_transformers/modeling_gpt2.py | 366 +++++++++++++-------------
 2 files changed, 210 insertions(+), 211 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index c931ac6ab2..ea908b1ca0 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -277,10 +277,11 @@ class BertEmbeddings(nn.Module):
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(self, input_ids, token_type_ids=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None):
         seq_length = input_ids.size(1)
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
-        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
@@ -624,6 +625,9 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
@@ -687,7 +691,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, head_mask=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -723,7 +727,7 @@ class BertModel(BertPreTrainedModel):
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
-        embedding_output = self.embeddings(input_ids, token_type_ids)
+        embedding_output = self.embeddings(input_ids, position_ids, token_type_ids)
         encoder_outputs = self.encoder(embedding_output,
                                        extended_attention_mask,
                                        head_mask=head_mask)
@@ -773,7 +777,7 @@ class BertForPreTraining(BertPreTrainedModel):
         >>> model = BertForPreTraining(config)
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         >>> outputs = model(input_ids)
-        >>> prediction_scores, seq_relationship_scores = outputs[:1]
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -792,9 +796,9 @@ class BertForPreTraining(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
@@ -842,7 +846,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         >>> model = BertForMaskedLM(config)
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         >>> outputs = model(input_ids, masked_lm_labels=input_ids)
-        >>> loss, prediction_scores = outputs[:1]
+        >>> loss, prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -861,8 +865,8 @@ class BertForMaskedLM(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -918,8 +922,8 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         seq_relationship_score = self.cls(pooled_output)
@@ -966,7 +970,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
         >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:1]
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -979,8 +983,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         pooled_output = self.dropout(pooled_output)
@@ -1071,7 +1075,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
         >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
         >>> outputs = model(input_ids, labels=labels)
-        >>> loss, classification_scores = outputs[:1]
+        >>> loss, classification_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1083,13 +1087,14 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         num_choices = input_ids.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, head_mask=head_mask)
+        outputs = self.bert(flat_input_ids, flat_position_ids, flat_token_type_ids, flat_attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         pooled_output = self.dropout(pooled_output)
@@ -1137,7 +1142,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
         >>> outputs = model(input_ids, labels=labels)
-        >>> loss, scores = outputs[:1]
+        >>> loss, scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1150,8 +1155,8 @@ class BertForTokenClassification(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
         sequence_output = self.dropout(sequence_output)
@@ -1177,7 +1182,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     the hidden-states output to compute `span start logits` and `span end logits`). """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
-    r"""
+    __doc__ = r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
@@ -1224,9 +1229,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, start_positions=None,
                 end_positions=None, head_mask=None):
-        outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
+        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
         logits = self.qa_outputs(sequence_output)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 0747c7a026..561228a47e 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -365,44 +365,81 @@ class GPT2PreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
+GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
+    `Language Models are Unsupervised Multitask Learners`_
+    by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+    It's a causal (unidirectional) transformer pre-trained using  language modeling on a very large
+    corpus of ~40 GB of text data.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Language Models are Unsupervised Multitask Learners`:
+        https://openai.com/blog/better-language-models/
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+"""
+
+GPT2_INPUTS_DOCTRING = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
 class GPT2Model(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
+    __doc__ = r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            that contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    GPT-2 use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+    Examples::
 
-    The embeddings are ordered as follow in the token embeddings matrix:
-    ::
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2Model(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + n_special - 1]                  ______________________
-
-    where total_tokens_embeddings is equal to
-
-    ::
-
-        total_tokens_embeddings = vocab_size + n_special
-
-    You should use the associated indices to index the embeddings.
-
-    Args:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-
-
-
-    Example::
-
-        config = modeling_gpt2.GPT2Config()
-        model = modeling_gpt2.GPT2Model(config)
     """
-
     def __init__(self, config):
         super(GPT2Model, self).__init__(config)
         self.output_hidden_states = config.output_hidden_states
@@ -428,47 +465,6 @@ class GPT2Model(GPT2PreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
-                (key and values in the attention blocks) to speed up sequential decoding
-                (this is the presents output of the model, cf. below).
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-             A tuple consisting of ``hidden_states`` and ``presents``.
-
-                 ``hidden_states`` are a list of all the encoded-hidden-states in the model (length of the list: number of
-                 layers + 1 for the output of the embeddings) as ``torch.FloatTensor`` of size [batch_size, sequence_length,
-                 hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of
-                 input_ids).
-
-                 ``presents`` are a list of pre-computed hidden-states (key and values in each attention blocks) as
-                 torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-            hidden_states, presents = model(input_ids)
-            # or
-            hidden_states, presents = model.forward(input_ids)
-
-        """
         if past is None:
             past_length = 0
             past = [None] * len(self.h)
@@ -540,21 +536,44 @@ class GPT2Model(GPT2PreTrainedModel):
         return outputs  # last hidden state, presents, (all hidden_states), (attentions)
 
 
+@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
+    __doc__ = r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
 
-    Args:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            that contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
+
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2LMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, lm_labels=input_ids)
+        >>> loss, logits = outputs[:2]
 
-        config = modeling_gpt2.GPT2Config()
-        model = modeling_gpt2.GPT2LMHeadModel(config)
     """
-
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
@@ -571,49 +590,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
                                    self.transformer.wte)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., vocab_size]
-            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
-                (key and values in the attention blocks) to speed up sequential decoding
-                (this is the presents output of the model, cf. below).
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``lm_labels`` is not ``None``, returns the language modeling loss. It ``lm_labels`` is ``None``, returns
-            a tuple of (``lm_logits``, ``presents``).
-
-                ``lm_logits`` is the language modeling logits as a ``torch.FloatTensor`` of size [batch_size,
-                sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ...
-                d_n are the dimension of input_ids).
-
-                ``presents`` is a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-            lm_logits, presents = model(input_ids)
-            # or
-            lm_logits, presents = model.forward(input_ids)
-
-        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
         hidden_states = transformer_outputs[0]
 
@@ -633,21 +609,88 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
 
 
+@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
+head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
+The language modeling head has its weights tied to the input embeddings,
+the classification head takes as input the input of a specified classification token index in the intput sequence).
+""", GPT2_START_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
+    __doc__ = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
+            Index of the classification token in each input sequence.
+            Selected in the range ``[0, input_ids.size(-1) - 1[``.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
+        **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
 
-    Args:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
 
-    Example::
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Multiple choice classification loss.
+        **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
+            Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            that contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2DoubleHeadsModel(config)
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
-        config = modeling_gpt2.GPT2Config()
-        model = modeling_gpt2.GPT2DoubleHeadsModel(config)
     """
-
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
@@ -665,55 +708,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
-                indices selected in the range [0, config.vocab_size[
-            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
-                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
-                with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., config.vocab_size]
-            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, ..., num_choices].
-            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
-                (key and values in the attention blocks) to speed up sequential decoding
-                (this is the presents output of the model, cf. below).
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            If ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a
-            ``tuple(language_modeling_loss, multiple_choice_loss)``. If they are not ``None``, outputs a
-            ``tuple(lm_logits, multiple_choice_logits, presents)``.
-
-                ``lm_logits``: the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, num_choices, sequence_length, config.vocab_size]
-
-                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of size [batch_size, num_choices]
-
-                ``presents``: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-            mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-            lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
-            # or
-            lm_logits, multiple_choice_logits, presents = model.forward(input_ids, mc_token_ids)
-
-        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
         hidden_states = transformer_outputs[0]
 

From 62b8eb43c1b722f8c8a3c89fce5d788a08fc9653 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 09:49:02 +0200
Subject: [PATCH 121/139] fix add_start_docstrings on python 2 (removed)

---
 pytorch_transformers/modeling_bert.py  | 16 +++++++--------
 pytorch_transformers/modeling_gpt2.py  |  9 +++++----
 pytorch_transformers/modeling_utils.py | 28 +++++++++++++++++---------
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index ea908b1ca0..f49aca0ddf 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -646,7 +646,7 @@ BERT_INPUTS_DOCSTRING = r"""
 @add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertModel(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
@@ -742,7 +742,7 @@ class BertModel(BertPreTrainedModel):
     a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
@@ -818,7 +818,7 @@ class BertForPreTraining(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
@@ -883,7 +883,7 @@ class BertForMaskedLM(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
             Indices should be in ``[0, 1]``.
@@ -941,7 +941,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     the pooled output) e.g. for GLUE tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the sequence classification/regression loss.
             Indices should be in ``[0, ..., config.num_labels]``.
@@ -1009,7 +1009,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
     BERT_START_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
@@ -1115,7 +1115,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels]``.
@@ -1182,7 +1182,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     the hidden-states output to compute `span start logits` and `span end logits`). """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
-    __doc__ = r"""
+    r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 561228a47e..8aaf84a099 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                          PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -414,7 +415,7 @@ GPT2_INPUTS_DOCTRING = r"""    Inputs:
 @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
                       GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
 class GPT2Model(GPT2PreTrainedModel):
-    __doc__ = r"""
+    r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
@@ -539,7 +540,7 @@ class GPT2Model(GPT2PreTrainedModel):
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
-    __doc__ = r"""
+    r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
@@ -615,7 +616,7 @@ The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the intput sequence).
 """, GPT2_START_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    __doc__ = r"""    Inputs:
+    r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             The second dimension of the input (`num_choices`) indicates the number of choices to score.
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 8971af306e..71fa9e3747 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -15,17 +15,20 @@
 # limitations under the License.
 """PyTorch BERT model."""
 
-from __future__ import absolute_import, division, print_function, unicode_literals
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
 
+import copy
+import json
 import logging
 import os
-import json
-import copy
 from io import open
 
+import six
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, functional as F
+from torch.nn import CrossEntropyLoss
+from torch.nn import functional as F
 
 from .file_utils import cached_path
 
@@ -36,11 +39,18 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 
-def add_start_docstrings(*docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = ''.join(docstr) + fn.__doc__
-        return fn
-    return docstring_decorator
+if not six.PY2:
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = ''.join(docstr) + fn.__doc__
+            return fn
+        return docstring_decorator
+else:
+    # Not possible to update class docstrings on python2
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
 
 
 class PretrainedConfig(object):

From 4cb489457f3a51be43fe3e2fbdf53ff00d49d3d0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 09:58:01 +0200
Subject: [PATCH 122/139] added doc for openai GPT

---
 pytorch_transformers/modeling_bert.py   |  26 +-
 pytorch_transformers/modeling_openai.py | 369 ++++++++++--------------
 2 files changed, 158 insertions(+), 237 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index f49aca0ddf..78dbc69982 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -154,6 +154,7 @@ class BertConfig(PretrainedConfig):
         :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
         `BertModel`.
 
+
         Arguments:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
@@ -193,31 +194,6 @@ class BertConfig(PretrainedConfig):
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
                  **kwargs):
-        """Constructs BertConfig.
-
-        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
-            hidden_size: Size of the encoder layers and the pooler layer.
-            num_hidden_layers: Number of hidden layers in the Transformer encoder.
-            num_attention_heads: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `BertModel`.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            layer_norm_eps: The epsilon used by LayerNorm.
-        """
         super(BertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index d873aef619..6e5dc44f04 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -379,47 +379,73 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
+OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
+    `Improving Language Understanding by Generative Pre-Training`_
+    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
+    corpus will long range dependencies, the Toronto Book Corpus.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Improving Language Understanding by Generative Pre-Training`:
+        https://openai.com/blog/language-unsupervised/
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+"""
+
+OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+                      OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained, such as: [SEP], [CLS]...
+    Examples::
 
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
-    The embeddings are ordered as follow in the token embeddings matrix:
-
-    ::
-
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + n_special - 1]                  ______________________
-
-    where ``total_tokens_embeddings``  is:
-
-    ::
-
-        total_tokens_embeddings = config.vocab_size + n_special
-
-    You should use the associated indices to index the embeddings.
-
-    Args:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-
-    Example::
-
-        config = modeling_openai.OpenAIGPTConfig()
-        model = modeling_openai.OpenAIGPTModel(config)
     """
-
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -444,37 +470,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            ``hidden_states``, a list of all the encoded-hidden-states in the model (length of the list is number
-            of layers + 1 for the output of the embeddings)
-            as ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-            hidden_states = model(input_ids)
-            # or
-            hidden_states = model.forward(input_ids)
-        """
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
             # start = self.config.vocab_size + self.config.n_special
@@ -536,46 +531,40 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         return outputs  # last hidden state, (all hidden states), (all attentions)
 
 
+@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
+    r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
 
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them. The number of special embeddings
-    can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    The embeddings are ordered as follow in the token embeddings matrix:
+    Examples::
 
-    ::
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, lm_labels=input_ids)
+        >>> loss, logits = outputs[:2]
 
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
-
-    ::
-
-        total_tokens_embeddings = config.vocab_size + config.n_special
-
-    You should use the associated indices to index the embeddings.
-
-    Args:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-
-    Example::
-
-        config = modeling_openai.OpenAIGPTConfig()
-        model = modeling_openai.OpenAIGPTLMHeadModel(config)
     """
-
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
@@ -592,40 +581,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
                                    self.transformer.tokens_embed)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., vocab_size]
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            if ``lm_labels`` is not ``None``, outputs the language modeling loss. Otherwise, outputs ``lm_logits``,
-            the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, sequence_length,
-            total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are
-            the dimension of input_ids)
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-            lm_logits = model(input_ids)
-            # or
-            lm_logits = model.forward(input_ids)
-        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
@@ -644,46 +599,80 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
 
 
+@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
+head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
+The language modeling head has its weights tied to the input embeddings,
+the classification head takes as input the input of a specified classification token index in the intput sequence).
+""", OPENAI_GPT_START_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training").
+    r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
+            Index of the classification token in each input sequence.
+            Selected in the range ``[0, input_ids.size(-1) - 1[``.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
+        **multiple_choice_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
 
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)``
-    function.
+            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
 
-    The embeddings are ordered as follow in the token embeddings matrix:
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Multiple choice classification loss.
+        **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
+            Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    ::
+    Examples::
 
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + n_special - 1]                  ______________________
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTDoubleHeadsModel(config)
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
-    where ``total_tokens_embeddings`` is:
-
-    ::
-
-        total_tokens_embeddings = config.vocab_size + .n_special
-
-    You should use the associate indices to index the embeddings.
-
-    Args:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Example::
-
-        config = modeling_openai.OpenAIGPTConfig()
-        model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
     """
-
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
@@ -703,50 +692,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
-                indices selected in the range [0, total_tokens_embeddings[
-            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
-                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                with the position indices (selected in the range [0, config.n_positions - 1[.
-            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
-                You can use it to add a third type of embedding to each input token in the sequence
-                (the previous two being the word and position embeddings).
-                The input, position and token_type embeddings are summed inside the Transformer before the first
-                self-attention block.
-            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
-                with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
-                is only computed for the labels set in [0, ..., total_tokens_embeddings]
-            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
-                with indices selected in [0, ..., num_choices].
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            if ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a tuple of losses with the
-            language modeling loss and the multiple choice loss. Otherwise, returns a
-            ``tuple(lm_logits, multiple_choice_logits)``.
-
-                ``lm_logits`` are the language modeling logits as a ``torch.FloatTensor`` of size
-                [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-
-                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of
-                size [batch_size, num_choices]
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-            mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-            lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
-            # or
-            lm_logits, multiple_choice_logits = model.forward(input_ids, mc_token_ids)
-        """
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
         hidden_states = transformer_outputs[0]
 

From 0201d86015d6c79dac376933161c21395479f4d8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 10:11:09 +0200
Subject: [PATCH 123/139] added doc for transformer-xl

---
 pytorch_transformers/modeling_gpt2.py       |   8 +-
 pytorch_transformers/modeling_openai.py     |  11 +-
 pytorch_transformers/modeling_transfo_xl.py | 199 ++++++++++----------
 3 files changed, 106 insertions(+), 112 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 8aaf84a099..06386f9ace 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -382,10 +382,10 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
 """
 
-GPT2_INPUTS_DOCTRING = r"""    Inputs:
+GPT2_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -413,7 +413,7 @@ GPT2_INPUTS_DOCTRING = r"""    Inputs:
 """
 
 @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2Model(GPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -538,7 +538,7 @@ class GPT2Model(GPT2PreTrainedModel):
 
 
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 6e5dc44f04..268252a12c 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -395,10 +396,10 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
 """
 
-OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -422,7 +423,7 @@ OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
 """
 
 @add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
-                      OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -532,7 +533,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
 
 @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING)
+(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index f368d32636..7eb7a46df3 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter
 
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)
 
 logger = logging.getLogger(__name__)
 
@@ -910,23 +910,71 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         pass
 
 
+TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
+    `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
+    by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+    It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
+    previously computed hidden-states to attend to longer context (memory).
+    This model also uses adaptive softmax inputs and outputs (tied).
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
+        https://arxiv.org/abs/1901.02860
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
 class TransfoXLModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Transformer XL uses relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
+    Examples::
 
-        - you don't need to specify positioning embeddings indices.
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states, mems = outputs[:2]
 
-        - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Args:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
@@ -1193,41 +1241,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
     def forward(self, input_ids, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `mems`: optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of ``(last_hidden_state, new_mems)``.
-
-                ``last_hidden_state``: the encoded-hidden-states at the top of the model
-                as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
-
-                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
-                ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, new_mems)
-        """
         # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
         # so we transpose here from shape [bsz, len] to shape [len, bsz]
         input_ids = input_ids.transpose(0, 1).contiguous()
@@ -1239,27 +1252,45 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
 
+@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
+    (adaptive softmax with weights tied to the adaptive input embeddings)""",
+    TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+    r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
 
-    This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+            We don't output them when the loss is computed to speedup adaptive softmax decoding.
+        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
+    Examples::
 
-        - you don't need to specify positioning embeddings indices
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, mems = outputs[:2]
 
-        - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
-
-    Args:
-        config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
-
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLLMHeadModel, self).__init__(config)
@@ -1310,44 +1341,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         return self.transformer.init_mems(data)
 
     def forward(self, input_ids, labels=None, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the labels token indices selected in the range [0, self.config.n_token[
-            `mems`: an optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of (last_hidden_state, new_mems)
-
-                ``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the negative
-                log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of
-                tokens of, shape [batch_size, sequence_length, n_tokens].
-
-                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
-                ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
-        """
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
 

From 44c985facdf562d6cf3d7cd72f2900e3a0d85d6e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 11:36:50 +0200
Subject: [PATCH 124/139] update doc for XLM and XLNet

---
 pytorch_transformers/modeling_bert.py       |  26 +-
 pytorch_transformers/modeling_gpt2.py       |  20 +-
 pytorch_transformers/modeling_openai.py     |  20 +-
 pytorch_transformers/modeling_transfo_xl.py |   8 +-
 pytorch_transformers/modeling_utils.py      |   1 -
 pytorch_transformers/modeling_xlm.py        | 471 +++++++++----------
 pytorch_transformers/modeling_xlnet.py      | 474 +++++++++-----------
 7 files changed, 459 insertions(+), 561 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 78dbc69982..a044832282 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -611,11 +611,11 @@ BERT_INPUTS_DOCSTRING = r"""
             (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
@@ -714,7 +714,7 @@ class BertModel(BertPreTrainedModel):
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with two heads on top as done during the pre-training:
+@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
     a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
@@ -791,7 +791,7 @@ class BertForPreTraining(BertPreTrainedModel):
         return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """,
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
     r"""
@@ -856,7 +856,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """,
+@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
     r"""
@@ -913,7 +913,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a sequence classification/regression head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
     the pooled output) e.g. for GLUE tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -981,7 +981,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a multiple choice classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
     BERT_START_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
@@ -1016,11 +1016,11 @@ class BertForMultipleChoice(BertPreTrainedModel):
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the multiple choice classification loss.
@@ -1087,7 +1087,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a token classification head on top (a linear layer on top of
+@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
     the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1154,17 +1154,17 @@ class BertForTokenClassification(BertPreTrainedModel):
         return outputs  # (loss), scores, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""Bert Model transformer BERT model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
     the hidden-states output to compute `span start logits` and `span end logits`). """,
     BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Position (index) of the start of the labelled span for computing the token classification loss.
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
         **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Position (index) of the end of the labelled span for computing the token classification loss.
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 06386f9ace..415396496c 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -404,11 +404,11 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
             (see `past` output below). Can be used to speed up sequential decoding.
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
@@ -541,7 +541,7 @@ class GPT2Model(GPT2PreTrainedModel):
 (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     r"""
-        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -549,7 +549,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             computed for labels in ``[0, ..., config.vocab_size]``
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Language modeling loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -571,7 +571,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         >>> model = GPT2LMHeadModel(config)
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, lm_labels=input_ids)
+        >>> outputs = model(input_ids, labels=input_ids)
         >>> loss, logits = outputs[:2]
 
     """
@@ -590,17 +590,17 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.wte)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
 
         outputs = (lm_logits,) + transformer_outputs[1:]
-        if lm_labels is not None:
+        if labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -639,11 +639,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             (see `past` output below). Can be used to speed up sequential decoding.
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 268252a12c..d51e4309b8 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -414,11 +414,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
@@ -536,7 +536,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
-        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
@@ -544,7 +544,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             computed for labels in ``[0, ..., config.vocab_size]``
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Language modeling loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -562,7 +562,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         >>> model = OpenAIGPTLMHeadModel(config)
         >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, lm_labels=input_ids)
+        >>> outputs = model(input_ids, labels=input_ids)
         >>> loss, logits = outputs[:2]
 
     """
@@ -581,16 +581,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
         transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
         outputs = (lm_logits,) + transformer_outputs[1:]
-        if lm_labels is not None:
+        if labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
@@ -625,11 +625,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 7eb7a46df3..d9c8cba8db 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -937,13 +937,13 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **mems**:
+        **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
         **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
-            Mask indices selected in ``[0, 1]``:
+            Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
@@ -954,7 +954,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the last layer of the model.
-        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+        **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
@@ -1270,7 +1270,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
             We don't output them when the loss is computed to speedup adaptive softmax decoding.
-        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+        **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 71fa9e3747..4e5fe92001 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -538,7 +538,6 @@ class PoolerAnswerClass(nn.Module):
 
 class SQuADHead(nn.Module):
     """ A SQuAD head inspired by XLNet.
-        Compute
     """
     def __init__(self, config):
         super(SQuADHead, self).__init__()
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 755e504b7d..33b5bcf7fe 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -30,7 +30,7 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings,
                              prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
@@ -392,28 +392,94 @@ class XLMPreTrainedModel(PreTrainedModel):
             module.weight.data.fill_(1.0)
 
 
+XLM_START_DOCSTRING = r"""    The XLM model was proposed in
+    `Cross-lingual Language Model Pretraining`_
+    by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
+
+        - a causal language modeling (CLM) objective (next token prediction),
+        - a masked language modeling (MLM) objective (Bert-like), or
+        - a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs)
+
+    Original code can be found `here`_.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Cross-lingual Language Model Pretraining`:
+        https://arxiv.org/abs/1901.07291
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    .. _`here`:
+        https://github.com/facebookresearch/XLM
+
+    Parameters:
+        config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+"""
+
+XLM_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens to be used to indicate the language of each token in the input.
+            Indices are selected in the pre-trained language vocabulary,
+            i.e. in the range ``[0, config.n_langs - 1[``.
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Length of each sentence that can be used to avoid performing attention on padding token indices.
+            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
+            Indices selected in ``[0, ..., input_ids.size(-1)]``:
+        **cache**:
+            dictionary with ``torch.FloatTensor`` that contains pre-computed
+            hidden-states (key and values in the attention blocks) as computed by the model
+            (see `cache` output below). Can be used to speed up sequential decoding.
+            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
 class XLMModel(XLMPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
     """
-    XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-
-    Paper: https://arxiv.org/abs/1901.07291
-
-    Original code: https://github.com/facebookresearch/XLM
-
-    Args:
-        `config`: a XLMConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Example::
-
-        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.XLMModel(config=config)
-    """
-
     ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
                   'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', 
                   'hidden_dim', 'dropout', 'attention_dropout', 'asm',
@@ -493,57 +559,8 @@ class XLMModel(XLMPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None,
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None,
                 token_type_ids=None, attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Parameters:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `lengths`: ``torch.LongTensor`` of size ``bs``, containing the length of each sentence
-            `positions`: ``torch.LongTensor`` of size ``(bs, slen)``, containing word positions
-            `langs`: ``torch.LongTensor`` of size ``(bs, slen)``, containing language IDs
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLM paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `cache`: TODO
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-
-        Returns:
-            A ``tuple(encoded_layers, pooled_output)``, with
-
-            ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
-
-                - ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states at the end \
-                of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
-                encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
-
-                - ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
-                to the last attention block of shape [batch_size, sequence_length, hidden_size],
-
-            ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a
-            classifier pre-trained on top of the hidden state associated to the first character of the
-            input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-        """
         if lengths is None:
             lengths = (input_ids != self.pad_index).sum(dim=1).long()
         # mask = input_ids != self.pad_index
@@ -563,18 +580,15 @@ class XLMModel(XLMPreTrainedModel):
         # if self.is_decoder and src_enc is not None:
         #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
 
-        # positions
-        if positions is None:
-            positions = input_ids.new((slen,)).long()
-            positions = torch.arange(slen, out=positions).unsqueeze(0)
+        # position_ids
+        if position_ids is None:
+            position_ids = input_ids.new((slen,)).long()
+            position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
         else:
-            assert positions.size() == (bs, slen)  # (slen, bs)
-            # positions = positions.transpose(0, 1)
+            assert position_ids.size() == (bs, slen)  # (slen, bs)
+            # position_ids = position_ids.transpose(0, 1)
 
         # langs
-        assert langs is None or token_type_ids is None, "You can only use one among langs and token_type_ids"
-        if token_type_ids is not None:
-            langs = token_type_ids
         if langs is not None:
             assert langs.size() == (bs, slen)  # (slen, bs)
             # langs = langs.transpose(0, 1)
@@ -598,7 +612,7 @@ class XLMModel(XLMPreTrainedModel):
         if cache is not None:
             _slen = slen - cache['slen']
             input_ids = input_ids[:, -_slen:]
-            positions = positions[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
             if langs is not None:
                 langs = langs[:, -_slen:]
             mask = mask[:, -_slen:]
@@ -606,9 +620,11 @@ class XLMModel(XLMPreTrainedModel):
 
         # embeddings
         tensor = self.embeddings(input_ids)
-        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
+        tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
         if langs is not None:
             tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
         tensor = self.layer_norm_emb(tensor)
         tensor = F.dropout(tensor, p=self.dropout, training=self.training)
         tensor *= mask.unsqueeze(-1).to(tensor.dtype)
@@ -702,25 +718,40 @@ class XLMPredLayer(nn.Module):
         return outputs
 
 
+@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
+    (linear layer with weights tied to the input embeddings). """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
 class XLMWithLMHeadModel(XLMPreTrainedModel):
-    """ XLM model from: "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
 
-    Paper: https://arxiv.org/abs/1901.07291
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Original code: https://github.com/facebookresearch/XLM
+    Examples::
 
-    Args:
-        `config`: a XLMConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMWithLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
-    Example::
-
-        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.XLMModel(config=config)
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
@@ -735,57 +766,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
-        """
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `lengths`: TODO
-            `positions`: TODO
-            `langs`: TODO
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLM paper for more details).
-            `attention_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `cache`: TODO
-            `labels`: TODO
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-
-        Returns:
-            A ``tuple(encoded_layers, pooled_output)``, with
-
-                ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
-
-                    If ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
-                    at the end of each attention block (i.e. 12 full sequences for XLM-base, 24 for XLM-large), each \
-                    encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size],
-
-                    If ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
-                    to the last attention block of shape [batch_size, sequence_length, hidden_size],
-
-                ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, hidden_size] which is the output of a \
-                classifier pre-trained on top of the hidden state associated to the first character of the \
-                input (`CLS`) to train on the Next-Sentence task (see XLM's paper).
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-        """
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
@@ -795,25 +778,40 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         return outputs
 
 
+@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
 class XLMForSequenceClassification(XLMPreTrainedModel):
-    """XLM model ("XLM: Generalized Autoregressive Pretraining for Language Understanding").
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
 
-    Args:
-        `config`: a XLMConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-        `summary_type`: str, "last", "first", "mean", or "attn". The method
-            to pool the input to get a vector representation. Default: last
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
+    Examples::
 
-
-    Example::
-
-        config = modeling.XLMConfig(vocab_size_or_config_json_file=32000, d_model=768,
-            n_layer=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.XLMModel(config=config)
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> 
+        >>> model = XLMForSequenceClassification(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -825,42 +823,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
-        """
-        Args:
-            input_ids: TODO
-            lengths: TODO
-            positions: TODO
-            langs: TODO
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the XLM model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            cache: TODO
-            labels: TODO
-            head_mask: TODO
-
-
-        Returns:
-            A ``tuple(logits_or_loss, new_mems)``. If ``labels`` is ``None``, return token logits with shape
-            [batch_size, sequence_length]. If it isn't ``None``, return the ``CrossEntropy`` loss with the targets.
-
-            ``new_mems`` is a list (num layers) of updated mem states at the entry of each layer \
-            each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
-            Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-        """
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
@@ -881,26 +846,53 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         return outputs
 
 
+@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
 class XLMForQuestionAnswering(XLMPreTrainedModel):
-    """
-    XLM model for Question Answering (span extraction).
-    This module is composed of the XLM model with a linear layer on top of
-    the sequence output that computes start_logits and end_logits
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels whether a question has an answer or no answer (SQuAD 2.0)
+        **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+        **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) 
 
-    Args:
-        `config`: a XLMConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
+    Examples::
 
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> 
+        >>> model = XLMForQuestionAnswering(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
-    Example::
-
-        config = XLMConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = XLMForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(XLMForQuestionAnswering, self).__init__(config)
@@ -910,63 +902,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, start_positions=None, end_positions=None,
                 cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
-
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            input_ids: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            lengths: TODO
-            positions: TODO
-            langs: TODO
-            token_type_ids: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLM paper for more details).
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the XLM model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            cache: TODO
-            start_positions: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            end_positions: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            cls_index: TODO
-            is_impossible: TODO
-            p_mask: TODO
-            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            Either the ``total_loss`` or a ``tuple(start_logits, end_logits)``
-
-                if ``start_positions`` and ``end_positions`` are not ``None``, \
-                outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
-
-                if ``start_positions`` or ``end_positions`` is ``None``:
-                Outputs a ``tuple(start_logits, end_logits)`` which are the logits respectively for the start and end
-                position tokens of shape [batch_size, sequence_length].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
-
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, positions=positions, token_type_ids=token_type_ids,
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
                                                langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index d3efd2799a..a46426d82a 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 """ PyTorch XLNet model.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
@@ -32,7 +30,8 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
+                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits,
+                             add_start_docstrings)
 
 
 logger = logging.getLogger(__name__)
@@ -619,26 +618,105 @@ class XLNetPreTrainedModel(PreTrainedModel):
                 module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
 
 
+XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
+    `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
+    by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+    XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method
+    to learn bidirectional contexts by maximizing the expected likelihood over all permutations
+    of the input sequence factorization order.
+
+    The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
+
+    Do to the difficulty of training a fully auto-regressive model over various factorization order,
+    XLNet is pretrained using only a sub-set of the output tokens as target which are selected
+    with the `target_mapping` input.
+
+    To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
+    `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
+        http://arxiv.org/abs/1906.08237
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+"""
+
+XLNET_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **input_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+            Kept for compatibility with the original code base.
+            You can only uses one of `input_mask` and `attention_mask`
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
+        **mems**: (`optional`)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+        **perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
+            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
+            If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
+            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
+            If None, each token attends to all the others (full bidirectional attention).
+            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
+        **target_mapping**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_predict, sequence_length)``:
+            Mask to indicate the output tokens to use.
+            If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
+            Only used during pretraining for partial prediction or for sequential decoding (generation).
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
+                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
 class XLNetModel(XLNetPreTrainedModel):
-    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok.
+    Examples::
 
-    Args:
-        `config`: a XLNetConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> model = XLNetModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
-
-    Example::
-
-        config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
-            n_layer=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = modeling.XLNetModel(config=config)
-
-    TODO Lysandre filled: Added example usage
     """
     def __init__(self, config):
         super(XLNetModel, self).__init__(config)
@@ -765,50 +843,6 @@ class XLNetModel(XLNetPreTrainedModel):
 
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
-            head_mask: TODO Lysandre didn't fill
-
-
-        Returns:
-            TODO Lysandre didn't fill: Missing returns!
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-
-        TODO Lysandre filled: Filled with the LMHead example, is probably different since it has a different output
-
-        """
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
@@ -952,23 +986,49 @@ class XLNetModel(XLNetPreTrainedModel):
         return outputs  # outputs, new_mems, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""XLNet Model with a language modeling head on top
+    (linear layer with weights tied to the input embeddings). """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
 class XLNetLMHeadModel(XLNetPreTrainedModel):
-    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
 
-    Args:
-        `config`: a XLNetConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Example::
+    Examples::
 
-        config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
-            n_layer=12, num_attention_heads=12, intermediate_size=3072)
+        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> model = XLNetLMHeadModel(config)
+        >>> # We show how to setup inputs to predict a next token using a bi-directional context.
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+        >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
-        model = modeling.XLNetLMHeadModel(config=config)
-
-    TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example
     """
     def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
@@ -989,58 +1049,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
-        """
-         all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-
-        Args:
-            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pretraining for partial prediction.
-                Set to None during finetuning.
-
-        Returns:
-            A ``tuple(encoded_layers, pooled_output)``, with
-
-                ``encoded_layers``: controlled by ``output_all_encoded_layers`` argument:
-
-                    - ``output_all_encoded_layers=True``: outputs a list of the full sequences of encoded-hidden-states \
-                    at the end of each attention block (i.e. 12 full sequences for XLNet-base, 24 for XLNet-large), \
-                    each encoded-hidden-state is a ``torch.FloatTensor`` of size [batch_size, sequence_length, d_model],
-
-                    - ``output_all_encoded_layers=False``: outputs only the full sequence of hidden-states corresponding \
-                    to the last attention block of shape [batch_size, sequence_length, d_model],
-
-                ``pooled_output``: a ``torch.FloatTensor`` of size [batch_size, d_model] which is the output of a \
-                classifier pretrained on top of the hidden state associated to the first character of the \
-                input (`CLS`) to train on the Next-Sentence task (see XLNet's paper).
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-        """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, head_mask)
 
@@ -1055,30 +1063,48 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                             labels.view(-1))
             outputs = (loss,) + outputs
 
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+        return outputs  # return (loss), logits, mems, (hidden states), (attentions)
 
 
+@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
-    """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
 
-    Args:
-        `config`: a XLNetConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-        `summary_type`: str, "last", "first", "mean", or "attn". The method
-            to pool the input to get a vector representation. Default: last
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
+    Examples::
 
+        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> 
+        >>> model = XLNetForSequenceClassification(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
 
-    Example::
-
-        # Already been converted into WordPiece token ids
-        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-        all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     """
     def __init__(self, config):
         super(XLNetForSequenceClassification, self).__init__(config)
@@ -1093,57 +1119,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            input_ids: int32 Tensor in shape [bsz, len], the input token IDs.
-            token_type_ids: int32 Tensor in shape [bsz, len], the input segment IDs.
-            input_mask: float32 Tensor in shape [bsz, len], the input mask.
-                0 for real tokens and 1 for padding.
-            attention_mask: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
-                You can only uses one among `input_mask` and `attention_mask`
-            mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
-                from previous batches. The length of the list equals n_layer.
-                If None, no memory is used.
-            perm_mask: float32 Tensor in shape [bsz, len, len].
-                If perm_mask[k, i, j] = 0, i attend to j in batch k;
-                if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
-                If None, each position attends to all the others.
-            target_mapping: float32 Tensor in shape [bsz, num_predict, len].
-                If target_mapping[k, i, j] = 1, the i-th predict in batch k is
-                on the j-th token.
-                Only used during pre-training for partial prediction.
-                Set to None during fine-tuning.
-            labels: TODO Lysandre didn't fill
-            head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-
-        Returns:
-            A ``tuple(logits_or_loss, mems)``
-
-                ``logits_or_loss``: if ``labels`` is ``None``, ``logits_or_loss`` corresponds to token logits with shape \
-                [batch_size, sequence_length]. If it is not ``None``, it corresponds to the ``CrossEntropy`` loss \
-                with the targets.
-
-                ``new_mems``: list (num layers) of updated mem states at the entry of each layer \
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model] \
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
-            # or
-            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
-        """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, head_mask)
         output = transformer_outputs[0]
@@ -1163,28 +1138,60 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             outputs = (loss,) + outputs
 
-        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+        return outputs  # return (loss), logits, mems, (hidden states), (attentions)
 
 
+@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
 class XLNetForQuestionAnswering(XLNetPreTrainedModel):
-    """
-    XLNet model for Question Answering (span extraction).
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels whether a question has an answer or no answer (SQuAD 2.0)
+        **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+        **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) 
 
-    This module is composed of the XLNet model with a linear layer on top of
-    the sequence output that computes ``start_logits`` and ``end_logits``
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Args:
-        `config`: a XLNetConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
+    Examples::
 
-    Example::
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> 
+        >>> model = XLMForQuestionAnswering(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
-        config = XLNetConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-        model = XLNetForQuestionAnswering(config)
     """
     def __init__(self, config):
         super(XLNetForQuestionAnswering, self).__init__(config)
@@ -1202,53 +1209,6 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
                 mems=None, perm_mask=None, target_mapping=None,
                 start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
                 head_mask=None):
-
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
-                `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
-            `token_type_ids`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with the token
-                types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-                a `sentence B` token (see XLNet paper for more details).
-            `attention_mask`: [optional] float32 Tensor, SAME FUNCTION as `input_mask`
-                but with 1 for real tokens and 0 for padding.
-                Added for easy compatibility with the BERT model (which uses this negative masking).
-                You can only uses one among ``input_mask`` and ``attention_mask``
-            `input_mask`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length] with indices
-                selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-                input sequence length in the current batch. It's the mask that we typically use for attention when
-                a batch has varying length sentences.
-            `start_positions`: position of the first token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            `end_positions`: position of the last token for the labeled span: ``torch.LongTensor`` of shape [batch_size].
-                Positions are clamped to the length of the sequence and position outside of the sequence are not taken
-                into account for computing the loss.
-            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-        Returns:
-            if ``start_positions`` and ``end_positions`` are not ``None``, outputs the total_loss which is the sum of the \
-            ``CrossEntropy`` loss for the start and end token positions.
-
-            if ``start_positions`` or ``end_positions`` is ``None``, outputs a tuple of ``start_logits``, ``end_logits``
-            which are the logits respectively for the start and end position tokens of shape \
-            [batch_size, sequence_length].
-
-        Example::
-
-            # Already been converted into WordPiece token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-            # or
-            start_logits, end_logits = model.forward(input_ids, token_type_ids, input_mask)
-        """
         transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
                                                mems, perm_mask, target_mapping, head_mask)
         hidden_states = transformer_outputs[0]

From e28d8bde0dec9b9148269bcd7e40c428f32b6b56 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 12:08:06 +0200
Subject: [PATCH 125/139] doc on base classes

---
 pytorch_transformers/modeling_utils.py | 112 +++++++++++++++++--------
 1 file changed, 76 insertions(+), 36 deletions(-)

diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 4e5fe92001..7a9777a0eb 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -54,7 +54,8 @@ else:
 
 
 class PretrainedConfig(object):
-    """ An abstract class to handle dowloading a model pretrained config.
+    """ Base class for all configuration classes.
+        Handle a few common parameters and methods for loading/downloading/saving configurations.
     """
     pretrained_config_archive_map = {}
 
@@ -66,7 +67,7 @@ class PretrainedConfig(object):
         self.torchscript = kwargs.pop('torchscript', False)
 
     def save_pretrained(self, save_directory):
-        """ Save a configuration file to a directory, so that it
+        """ Save a configuration object to a directory, so that it
             can be re-loaded using the `from_pretrained(save_directory)` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -78,16 +79,30 @@ class PretrainedConfig(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *input, **kwargs):
-        """
-        Instantiate a PretrainedConfig from a pre-trained model configuration.
+        r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
 
         Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `xlnet-large-cased`
-                - a path or url to a directory containing a configuration file `config.json` for the model,
-                - a path or url to a configuration file for the model.
-            cache_dir: an optional path to a folder in which the pre-trained model configuration will be cached.
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a saved configuration `file`.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **kwargs**: (`optional`) dict:
+                Dictionnary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters.
+
+        Examples::
+
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+            >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True)
+            >>> assert config.output_attention == True
+
         """
         cache_dir = kwargs.pop('cache_dir', None)
 
@@ -172,7 +187,7 @@ class PretrainedConfig(object):
 
 
 class PreTrainedModel(nn.Module):
-    """ An abstract class to handle storing model config and
+    """ Base class for all models. Handle loading/storing model config and
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = PretrainedConfig
@@ -199,11 +214,12 @@ class PreTrainedModel(nn.Module):
             Reducing the size will remove vectors from the end
 
         Args:
-            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+            new_num_tokens: (`optional`) int
+                New number of tokens in the embedding matrix.
                 Increasing the size will add newly initialized vectors at the end
                 Reducing the size will remove vectors from the end
                 If not provided or None: return the provided token Embedding Module.
-        Return:
+        Return: ``torch.nn.Embeddings``
             Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
         """
         if new_num_tokens is None:
@@ -236,13 +252,15 @@ class PreTrainedModel(nn.Module):
 
     def resize_token_embeddings(self, new_num_tokens=None):
         """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
+            Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
 
         Args:
-            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+            new_num_tokens: (`optional`) int
+                New number of tokens in the embedding matrix.
                 Increasing the size will add newly initialized vectors at the end
                 Reducing the size will remove vectors from the end
                 If not provided or None: does nothing.
-        Return:
+        Return: ``torch.nn.Embeddings``
             Pointer to the input tokens Embedding Module of the model
         """
         base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
@@ -262,7 +280,8 @@ class PreTrainedModel(nn.Module):
 
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            Args:
+                heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
         """
         base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
         base_model._prune_heads(heads_to_prune)
@@ -286,26 +305,47 @@ class PreTrainedModel(nn.Module):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
+        r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
 
         Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load, or
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
-                - a path or url to a tensorflow pretrained model checkpoint containing:
-                    . `config.json` a configuration file for the model
-                    . `model.chkpt` a TensorFlow checkpoint
-            config: an optional configuration for the model
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use
-                instead of Google pre-trained models
-            *inputs, **kwargs: additional input for the specific XLNet class
-                (ex: num_labels for XLNetForSequenceClassification)
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
+                    In this case, ``from_tf`` should be set to True and a configuration object should be
+                    provided as `config` argument. This loading option is slower than converting the TensorFlow
+                    checkpoint in a PyTorch model using the provided conversion scripts and loading
+                    the PyTorch model afterwards.
+            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
+                Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
+                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
+            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
+                from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuraton but load your own weights.
+                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
+                a simpler option.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **output_loading_info**: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            **kwargs**: (`optional`) dict:
+                Dictionnary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters. E.g. ``output_attention=True``
+
+        Examples::
+
+            >>> model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            >>> model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            >>> assert model.config.output_attention == True
+            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
+            >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
         """
         config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
@@ -428,7 +468,7 @@ class PreTrainedModel(nn.Module):
 
 class Conv1D(nn.Module):
     def __init__(self, nf, nx):
-        """ Conv1D layer as defined by Alec for GPT (and also used in GPT-2)
+        """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
             Basically works like a Linear layer but the weights are transposed
         """
         super(Conv1D, self).__init__()
@@ -612,7 +652,7 @@ class SQuADHead(nn.Module):
 
 
 class SequenceSummary(nn.Module):
-    """ Compute a single vector summary of a sequence hidden states according to various possibilities:
+    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
         Args of the config class:
             summary_type:
                 - 'last' => [default] take the last token hidden state (like XLNet)

From f7cd7392fd2fe15c1a5eedf9c1709fafad41368a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 12:32:19 +0200
Subject: [PATCH 126/139] fixed tests

---
 pytorch_transformers/modeling_bert.py       | 50 +++++++++++++--------
 pytorch_transformers/modeling_gpt2.py       |  6 ++-
 pytorch_transformers/modeling_openai.py     |  6 ++-
 pytorch_transformers/modeling_transfo_xl.py |  2 +-
 pytorch_transformers/modeling_utils.py      |  2 +-
 pytorch_transformers/modeling_xlm.py        | 15 ++++---
 pytorch_transformers/modeling_xlnet.py      | 20 ++++++---
 7 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index a044832282..27c8023c0a 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -253,7 +253,7 @@ class BertEmbeddings(nn.Module):
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None):
+    def forward(self, input_ids, token_type_ids=None, position_ids=None):
         seq_length = input_ids.size(1)
         if position_ids is None:
             position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
@@ -667,7 +667,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -703,7 +703,7 @@ class BertModel(BertPreTrainedModel):
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
-        embedding_output = self.embeddings(input_ids, position_ids, token_type_ids)
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
         encoder_outputs = self.encoder(embedding_output,
                                        extended_attention_mask,
                                        head_mask=head_mask)
@@ -772,9 +772,10 @@ class BertForPreTraining(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                next_sentence_label=None, position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
@@ -841,8 +842,10 @@ class BertForMaskedLM(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -898,8 +901,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
+                position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         seq_relationship_score = self.cls(pooled_output)
@@ -959,8 +964,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         pooled_output = self.dropout(pooled_output)
@@ -1063,14 +1070,16 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
         num_choices = input_ids.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, flat_position_ids, flat_token_type_ids, flat_attention_mask, head_mask=head_mask)
+        outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
+                            attention_mask=flat_attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
 
         pooled_output = self.dropout(pooled_output)
@@ -1131,8 +1140,10 @@ class BertForTokenClassification(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
         sequence_output = self.dropout(sequence_output)
@@ -1205,9 +1216,10 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, attention_mask=None, start_positions=None,
-                end_positions=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids, token_type_ids, attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
+                end_positions=None, position_ids=None, head_mask=None):
+        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
 
         logits = self.qa_outputs(sequence_output)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 415396496c..8edd7555db 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -591,7 +591,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
                                    self.transformer.wte)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               past=past, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
@@ -709,7 +710,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               past=past, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index d51e4309b8..ebd4166b99 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -582,7 +582,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
                                    self.transformer.tokens_embed)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
@@ -693,7 +694,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index d9c8cba8db..c9ae7cd1a9 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -1344,7 +1344,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
 
-        transformer_outputs = self.transformer(input_ids, mems, head_mask)
+        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask)
 
         last_hidden = transformer_outputs[0]
         pred_hid = last_hidden[:, -tgt_len:]
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 7a9777a0eb..3f21c98b04 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -594,7 +594,7 @@ class SQuADHead(nn.Module):
         """
         outputs = ()
 
-        start_logits = self.start_logits(hidden_states, p_mask)
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
 
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, let's remove the dimension added by batch splitting
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 33b5bcf7fe..7d08c462ad 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -768,8 +768,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
     def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         outputs = self.pred_layer(output, labels)
@@ -825,8 +826,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
     def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
@@ -905,8 +907,9 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
     def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, start_positions=None, end_positions=None,
                 cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               langs=langs, attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
 
         output = transformer_outputs[0]
 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index a46426d82a..5e576c51c1 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -1049,8 +1049,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, head_mask)
+        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
+                                               input_mask=input_mask, attention_mask=attention_mask,
+                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+                                               head_mask=head_mask)
 
         logits = self.lm_loss(transformer_outputs[0])
 
@@ -1119,8 +1121,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None,
                 labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, head_mask)
+        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
+                                               input_mask=input_mask, attention_mask=attention_mask,
+                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
 
         output = self.sequence_summary(output)
@@ -1209,10 +1213,12 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
                 mems=None, perm_mask=None, target_mapping=None,
                 start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
                 head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids, input_mask, attention_mask,
-                                               mems, perm_mask, target_mapping, head_mask)
+        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
+                                               input_mask=input_mask, attention_mask=attention_mask,
+                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
-        start_logits = self.start_logits(hidden_states, p_mask)
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
 
         outputs = transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
 

From a9ab15174cde498aa539a40da0676088d745a531 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 12:42:12 +0200
Subject: [PATCH 127/139] fix #328

---
 pytorch_transformers/tokenization_bert.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 836b20aef7..e552407689 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -66,15 +66,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    index = 0
     with open(vocab_file, "r", encoding="utf-8") as reader:
-        while True:
-            token = reader.readline()
-            if not token:
-                break
-            token = token.strip()
-            vocab[token] = index
-            index += 1
+        tokens = reader.read().splitlines()
+    for index, token in enumerate(tokens):
+        vocab[token] = index
+        index += 1
     return vocab
 
 
@@ -213,7 +209,7 @@ class BasicTokenizer(object):
         self.do_lower_case = do_lower_case
         self.never_split = never_split
 
-    def tokenize(self, text, never_split=None):
+    def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
         """Tokenizes a piece of text."""
         never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
@@ -223,7 +219,8 @@ class BasicTokenizer(object):
         # and generally don't have any Chinese data in them (there are Chinese
         # characters in the vocabulary because Wikipedia does have some Chinese
         # words in the English Wikipedia.).
-        text = self._tokenize_chinese_chars(text)
+        if tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:

From ab49fafc047b13d215f1857b3b638cabf19c3fe8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 12:51:23 +0200
Subject: [PATCH 128/139] update tokenization docstrings for #328

---
 pytorch_transformers/tokenization_bert.py | 57 ++++++++++++++++-------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index e552407689..8b34a43e5a 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -104,16 +104,23 @@ class BertTokenizer(PreTrainedTokenizer):
 
     def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
                  unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
-                 mask_token="[MASK]", **kwargs):
+                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
         """Constructs a BertTokenizer.
 
         Args:
-          vocab_file: Path to a one-wordpiece-per-line vocabulary file
-          do_lower_case: Whether to lower case the input
-                         Only has an effect when do_wordpiece_only=False
-          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-          never_split: List of tokens which will never be split during tokenization.
-                         Only has an effect when do_wordpiece_only=False
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input
+                Only has an effect when do_basic_tokenize=True
+            **do_basic_tokenize**: (`optional`) boolean (default True)
+                Whether to do basic tokenization before wordpiece.
+            **never_split**: (`optional`) list of string
+                List of tokens which will never be split during tokenization.
+                Only has an effect when do_basic_tokenize=True
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be desactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
@@ -127,8 +134,9 @@ class BertTokenizer(PreTrainedTokenizer):
             [(ids, tok) for tok, ids in self.vocab.items()])
         self.do_basic_tokenize = do_basic_tokenize
         if do_basic_tokenize:
-          self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
-                                                never_split=never_split)
+            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                  never_split=never_split,
+                                                  tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
     @property
@@ -196,21 +204,36 @@ class BertTokenizer(PreTrainedTokenizer):
 class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
-    def __init__(self,
-                 do_lower_case=True,
-                 never_split=None):
-        """Constructs a BasicTokenizer.
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
+        """ Constructs a BasicTokenizer.
 
         Args:
-          do_lower_case: Whether to lower case the input.
+            **do_lower_case**: Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of token not to split.
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be desactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         if never_split is None:
             never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = never_split
+        self.tokenize_chinese_chars = tokenize_chinese_chars
 
-    def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
-        """Tokenizes a piece of text."""
+    def tokenize(self, text, never_split=None):
+        """ Basic Tokenization of a piece of text.
+            Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of token not to split.
+        """
         never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
@@ -219,7 +242,7 @@ class BasicTokenizer(object):
         # and generally don't have any Chinese data in them (there are Chinese
         # characters in the vocabulary because Wikipedia does have some Chinese
         # words in the English Wikipedia.).
-        if tokenize_chinese_chars:
+        if self.tokenize_chinese_chars:
             text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []

From 74a24f0fe96d3ca19157171e333cfcc2ab4707c5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 13:49:01 +0200
Subject: [PATCH 129/139] clean up file_utils

---
 pytorch_transformers/file_utils.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py
index 1397bd416b..25cdf99837 100644
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -258,21 +258,3 @@ def get_from_cache(url, cache_dir=None):
             logger.info("removing temp file %s", temp_file.name)
 
     return cache_path
-
-
-def read_set_from_file(filename):
-    '''
-    Extract a de-duped collection (set) of text from a file.
-    Expected file format is one item per line.
-    '''
-    collection = set()
-    with open(filename, 'r', encoding='utf-8') as file_:
-        for line in file_:
-            collection.add(line.rstrip())
-    return collection
-
-
-def get_file_extension(path, dot=True, lower=True):
-    ext = os.path.splitext(path)[1]
-    ext = ext if dot else ext[1:]
-    return ext.lower() if lower else ext

From 8ca767f13c988f27126bcc99a10bce1671c032de Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 13:49:07 +0200
Subject: [PATCH 130/139] clean up optimization

---
 pytorch_transformers/optimization.py | 120 ++++++++++++---------------
 1 file changed, 55 insertions(+), 65 deletions(-)

diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py
index f0ac914341..c08d3cb58b 100644
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -24,55 +24,16 @@ from torch.optim.lr_scheduler import LambdaLR
 logger = logging.getLogger(__name__)
 
 class ConstantLRSchedule(LambdaLR):
+    """ Constant learning rate schedule.
+    """
     def __init__(self, optimizer, last_epoch=-1):
         super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
 
-class WarmupCosineSchedule(LambdaLR):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` training steps.
-    Decreases learning rate from 1. to 0. over remaining `t_total - warmup` steps following a cosine curve.
-    If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    :param warmup:      see LRSchedule
-    :param t_total:     see LRSchedule
-    :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-    :param kw:
-    """
-    warn_t_total = True
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
-
-        def lr_lambda(step):
-            if step < warmup_steps:
-                return float(step) / float(max(1.0, warmup_steps))
-            else:
-                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
-                return 0.5 * (1. + math.cos(math.pi * float(cycles) * 2.0 * progress))
-
-        super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
-
-class WarmupCosineWithHardRestartsSchedule(LambdaLR):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
-    learning rate (with hard restarts).
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
-
-        def lr_lambda(step):
-            if step < warmup_steps:
-                return float(step) / float(max(1, warmup_steps))
-            else:
-                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
-                if progress >= 1.0:
-                    return 0.0
-                return 0.5 * (1. + math.cos(math.pi * ((float(cycles) * progress) % 1.0)))
-
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
-
 
 class WarmupConstantSchedule(LambdaLR):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Keeps learning rate equal to 1. after warmup.
+    """ Linear warmup and then constant.
+        Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
+        Keeps learning rate schedule equal to 1. after warmup_steps.
     """
     def __init__(self, optimizer, warmup_steps, last_epoch=-1):
 
@@ -85,48 +46,77 @@ class WarmupConstantSchedule(LambdaLR):
 
 
 class WarmupLinearSchedule(LambdaLR):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
+    """ Linear warmup and then linear decay.
+        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
+        Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
     """
     def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
 
         def lr_lambda(step):
             if step < warmup_steps:
                 return float(step) / float(max(1, warmup_steps))
-            return float(t_total - step) / float(max(1.0, t_total - warmup_steps))
+            return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))
 
         super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
+class WarmupCosineSchedule(LambdaLR):
+    """ Linear warmup and then cosine decay.
+        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
+        Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
+        If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+    """
+    warn_t_total = True
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return float(step) / float(max(1.0, warmup_steps))
+            else:
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                return max(0.0, 0.5 * (1. + math.cos(math.pi * float(cycles) * 2.0 * progress)))
+
+        super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+class WarmupCosineWithHardRestartsSchedule(LambdaLR):
+    """ Linear warmup and then cosine cycles with hard restarts.
+        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
+        If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
+        learning rate (with hard restarts).
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return float(step) / float(max(1, warmup_steps))
+            else:
+                progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))   # progress after warmup
+                if progress >= 1.0:
+                    return 0.0
+                return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(cycles) * progress) % 1.0))))
+
+        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
 class AdamW(Optimizer):
     """ Implements Adam algorithm with weight decay fix.
 
     Parameters:
-        lr: learning rate
-        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
-        t_total: total number of training steps for the learning
-            rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
-        schedule: schedule to use for the warmup (see above).
-            Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
-            If `None` or `'none'`, learning rate is always kept constant.
-            Default : `'warmup_linear'`
-        b1: Adams b1. Default: 0.9
-        b2: Adams b2. Default: 0.999
-        e: Adams epsilon. Default: 1e-6
-        weight_decay: Weight decay. Default: 0.01
-        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
-        correct_bias: can be set to False to avoid correcting bias in Adam (e.g. like in Bert repository)
+        lr (float): learning rate. Default 1e-3.
+        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
+        eps (float): Adams epsilon. Default: 1e-6
+        weight_decay (float): Weight decay. Default: 0.0
+        correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
     """
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01, correct_bias=True):
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
         if lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if not 0.0 <= betas[0] < 1.0:
             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
         if not 0.0 <= betas[1]  < 1.0:
-            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1] ))
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
         if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
+            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                         correct_bias=correct_bias)
         super(AdamW, self).__init__(params, defaults)

From 3b469cb4229c983ee8b4fed58284742f6ac93f9a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 15:28:37 +0200
Subject: [PATCH 131/139] updating squad for compatibility with XLNet

---
 examples/run_squad.py                  |  67 ++++--
 examples/utils_squad.py                | 270 +++++++++++++++++++++++--
 examples/utils_squad_evaluate.py       |  43 +++-
 pytorch_transformers/modeling_utils.py |  79 ++++++--
 pytorch_transformers/modeling_xlnet.py |   5 +-
 5 files changed, 402 insertions(+), 62 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 24f00e0518..2025217454 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -41,7 +41,9 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
 
 from pytorch_transformers import AdamW, WarmupLinearSchedule
 
-from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
 
 # The follwing import is the official SQuAD evaluation script (2.0).
 # You can remove it from the dependencies if you are using this script outside of the library
@@ -66,6 +68,8 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+def to_list(tensor):
+    return tensor.detach().cpu().tolist()
 
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
@@ -118,10 +122,13 @@ def train(args, train_dataset, model, tokenizer):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':       batch[0],
-                      'token_type_ids':  batch[1] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'token_type_ids':  None if args.model_type == 'xlm' else batch[1],  # XLM don't use segment_ids
                       'attention_mask':  batch[2],
                       'start_positions': batch[3],
                       'end_positions':   batch[4]}
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':    batch[6]})
             ouputs = model(**inputs)
             loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
@@ -197,31 +204,50 @@ def evaluate(args, model, tokenizer, prefix=""):
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
         model.eval()
         batch = tuple(t.to(args.device) for t in batch)
-        example_indices = batch[3]
         with torch.no_grad():
             inputs = {'input_ids':      batch[0],
-                        'token_type_ids': batch[1] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                        'attention_mask': batch[2]}
+                      'token_type_ids': None if args.model_type == 'xlm' else batch[1],  # XLM don't use segment_ids
+                      'attention_mask': batch[2]}
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
             outputs = model(**inputs)
             batch_start_logits, batch_end_logits = outputs[:2]
 
         for i, example_index in enumerate(example_indices):
-            start_logits = batch_start_logits[i].detach().cpu().tolist()
-            end_logits = batch_end_logits[i].detach().cpu().tolist()
             eval_feature = features[example_index.item()]
             unique_id = int(eval_feature.unique_id)
-            all_results.append(RawResult(unique_id=unique_id,
-                                         start_logits=start_logits,
-                                         end_logits=end_logits))
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
 
     # Compute predictions
     output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
     output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
     output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
-    write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length,
-                      args.do_lower_case, output_prediction_file, output_nbest_file,
-                      output_null_log_odds_file, args.verbose_logging,
-                      args.version_2_with_negative, args.null_score_diff_threshold)
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        args.start_n_top, args.end_n_top, args.version_2_with_negative)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
 
     # Evaluate with the official SQuAD script
     evaluate_options = EVAL_OPTS(data_file=args.predict_file,
@@ -244,8 +270,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     else:
         logger.info("Creating features from dataset file at %s", input_file)
         examples = read_squad_examples(input_file=input_file,
-                                       is_training=not evaluate,
-                                       version_2_with_negative=args.version_2_with_negative)
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
         features = convert_examples_to_features(examples=examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=args.max_seq_length,
@@ -260,13 +286,18 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
     all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
     if evaluate:
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_example_index, all_cls_index, all_p_mask)
     else:
         all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
         all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_start_positions, all_end_positions,
+                                all_cls_index, all_p_mask)
 
     if output_examples:
         return dataset, examples, features
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index 305eeb7b40..d898a0a17e 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -26,6 +26,9 @@ from io import open
 
 from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
+# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
+from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
+
 logger = logging.getLogger(__name__)
 
 
@@ -82,6 +85,8 @@ class InputFeatures(object):
                  input_ids,
                  input_mask,
                  segment_ids,
+                 cls_index,
+                 p_mask,
                  start_position=None,
                  end_position=None,
                  is_impossible=None):
@@ -94,6 +99,8 @@ class InputFeatures(object):
         self.input_ids = input_ids
         self.input_mask = input_mask
         self.segment_ids = segment_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
         self.start_position = start_position
         self.end_position = end_position
         self.is_impossible = is_impossible
@@ -178,13 +185,25 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):
 
 
 def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training):
+                                 doc_stride, max_query_length, is_training,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=0, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
     """Loads a data file into a list of `InputBatch`s."""
 
     unique_id = 1000000000
+    # cnt_pos, cnt_neg = 0, 0
+    # max_N, max_M = 1024, 1024
+    # f = np.zeros((max_N, max_M), dtype=np.float32)
 
     features = []
     for (example_index, example) in enumerate(examples):
+
+        # if example_index % 100 == 0:
+        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
+
         query_tokens = tokenizer.tokenize(example.question_text)
 
         if len(query_tokens) > max_query_length:
@@ -239,14 +258,30 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
             token_to_orig_map = {}
             token_is_max_context = {}
             segment_ids = []
-            tokens.append("[CLS]")
-            segment_ids.append(0)
+
+            # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+            # Original TF implem also keep the classification token (set to 0) (not sure why...)
+            p_mask = []
+
+            # CLS token at the beginning
+            if not cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = 0
+
+            # Query
             for token in query_tokens:
                 tokens.append(token)
-                segment_ids.append(0)
-            tokens.append("[SEP]")
-            segment_ids.append(0)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)
 
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_a_segment_id)
+            p_mask.append(1)
+
+            # Paragraph
             for i in range(doc_span.length):
                 split_token_index = doc_span.start + i
                 token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
@@ -255,29 +290,42 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                                        split_token_index)
                 token_is_max_context[len(tokens)] = is_max_context
                 tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+                segment_ids.append(sequence_b_segment_id)
+                p_mask.append(0)
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_b_segment_id)
+            p_mask.append(1)
+
+            # CLS token at the end
+            if cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = len(tokens) - 1  # Index of classification token
 
             input_ids = tokenizer.convert_tokens_to_ids(tokens)
 
             # The mask has 1 for real tokens and 0 for padding tokens. Only real
             # tokens are attended to.
-            input_mask = [1] * len(input_ids)
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
 
             # Zero-pad up to the sequence length.
             while len(input_ids) < max_seq_length:
-                input_ids.append(0)
-                input_mask.append(0)
-                segment_ids.append(0)
+                input_ids.append(pad_token)
+                input_mask.append(0 if mask_padding_with_zero else 1)
+                segment_ids.append(pad_token_segment_id)
+                p_mask.append(1)
 
             assert len(input_ids) == max_seq_length
             assert len(input_mask) == max_seq_length
             assert len(segment_ids) == max_seq_length
 
+            span_is_impossible = example.is_impossible
             start_position = None
             end_position = None
-            if is_training and not example.is_impossible:
+            if is_training and not span_is_impossible:
                 # For training, if our document chunk does not contain an annotation
                 # we throw it out, since there is nothing to predict.
                 doc_start = doc_span.start
@@ -289,13 +337,16 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 if out_of_span:
                     start_position = 0
                     end_position = 0
+                    span_is_impossible = True
                 else:
                     doc_offset = len(query_tokens) + 2
                     start_position = tok_start_position - doc_start + doc_offset
                     end_position = tok_end_position - doc_start + doc_offset
-            if is_training and example.is_impossible:
-                start_position = 0
-                end_position = 0
+
+            if is_training and span_is_impossible:
+                start_position = cls_index
+                end_position = cls_index
+
             if example_index < 20:
                 logger.info("*** Example ***")
                 logger.info("unique_id: %s" % (unique_id))
@@ -312,9 +363,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                     "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                 logger.info(
                     "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and example.is_impossible:
+                if is_training and span_is_impossible:
                     logger.info("impossible example")
-                if is_training and not example.is_impossible:
+                if is_training and not span_is_impossible:
                     answer_text = " ".join(tokens[start_position:(end_position + 1)])
                     logger.info("start_position: %d" % (start_position))
                     logger.info("end_position: %d" % (end_position))
@@ -332,9 +383,11 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                     input_ids=input_ids,
                     input_mask=input_mask,
                     segment_ids=segment_ids,
+                    cls_index=cls_index,
+                    p_mask=p_mask,
                     start_position=start_position,
                     end_position=end_position,
-                    is_impossible=example.is_impossible))
+                    is_impossible=span_is_impossible))
             unique_id += 1
 
     return features
@@ -417,7 +470,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 RawResult = collections.namedtuple("RawResult",
                                    ["unique_id", "start_logits", "end_logits"])
 
-
 def write_predictions(all_examples, all_features, all_results, n_best_size,
                       max_answer_length, do_lower_case, output_prediction_file,
                       output_nbest_file, output_null_log_odds_file, verbose_logging,
@@ -612,6 +664,182 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
     return all_predictions
 
 
+# For XLNet (and XLM which uses the same head)
+RawResultExtended = collections.namedtuple("RawResultExtended",
+    ["unique_id", "start_top_log_probs", "start_top_index",
+     "end_top_log_probs", "end_top_index", "cls_logits"])
+
+
+def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
+                                max_answer_length, output_prediction_file,
+                                output_nbest_file,
+                                output_null_log_odds_file, orig_data,
+                                start_n_top, end_n_top, version_2_with_negative):
+    """ XLNet write prediction logic (more complex than Bert's).
+        Write final predictions to the json file and log-odds of null if needed.
+
+        Requires utils_squad_evaluate.py
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index",
+        "start_log_prob", "end_log_prob"])
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+    logger.info("Writing predictions to: %s", output_prediction_file)
+    # logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_top_log_probs[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob))
+
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_log_prob + x.end_log_prob),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            tok_start_to_orig_index = feature.tok_start_to_orig_index
+            tok_end_to_orig_index = feature.tok_end_to_orig_index
+            start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            end_orig_pos = tok_end_to_orig_index[pred.end_index]
+
+            paragraph_text = example.paragraph_text
+            final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            if final_text in seen_predictions:
+                continue
+
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_log_prob=pred.start_log_prob,
+                    end_log_prob=pred.end_log_prob))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="", start_log_prob=-1e6,
+                end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+        assert best_non_null_entry is not None
+
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    qid_to_has_ans = make_qid_to_has_ans(orig_data)
+    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
+    out_eval = {}
+
+    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
+
+    return out_eval
+
+
 def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
     """Project the tokenized prediction back to the original text."""
 
diff --git a/examples/utils_squad_evaluate.py b/examples/utils_squad_evaluate.py
index d0cf643fe3..ed162e6fe6 100644
--- a/examples/utils_squad_evaluate.py
+++ b/examples/utils_squad_evaluate.py
@@ -1,4 +1,5 @@
-"""Official evaluation script for SQuAD version 2.0.
+""" Official evaluation script for SQuAD version 2.0.
+    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
 
 In addition to basic functionality, we also compute additional statistics and
 plot precision-recall curves if an additional na_prob.json file is provided.
@@ -232,6 +233,36 @@ def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
       best_thresh = na_probs[qid]
   return 100.0 * best_score / len(scores), best_thresh
 
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+  cur_score = num_no_ans
+  best_score = cur_score
+  best_thresh = 0.0
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  for i, qid in enumerate(qid_list):
+    if qid not in scores: continue
+    if qid_to_has_ans[qid]:
+      diff = scores[qid]
+    else:
+      if preds[qid]:
+        diff = -1
+      else:
+        diff = 0
+    cur_score += diff
+    if cur_score > best_score:
+      best_score = cur_score
+      best_thresh = na_probs[qid]
+
+  has_ans_score, has_ans_cnt = 0, 0
+  for qid in qid_list:
+    if not qid_to_has_ans[qid]: continue
+    has_ans_cnt += 1
+
+    if qid not in scores: continue
+    has_ans_score += scores[qid]
+
+  return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
+
 def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
   best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
   best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
@@ -240,6 +271,16 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_h
   main_eval['best_f1'] = best_f1
   main_eval['best_f1_thresh'] = f1_thresh
 
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+  best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+  main_eval['best_exact'] = best_exact
+  main_eval['best_exact_thresh'] = exact_thresh
+  main_eval['best_f1'] = best_f1
+  main_eval['best_f1_thresh'] = f1_thresh
+  main_eval['has_ans_exact'] = has_ans_exact
+  main_eval['has_ans_f1'] = has_ans_f1
+
 def main(OPTS):
   with open(OPTS.data_file) as f:
     dataset_json = json.load(f)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 3f21c98b04..ebee4fac1d 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -493,8 +493,9 @@ class PoolerStartLogits(nn.Module):
 
     def forward(self, hidden_states, p_mask=None):
         """ Args:
-            `p_mask`: [optional] invalid position mask such as query and special symbols (PAD, SEP, CLS)
-                shape [batch_size, seq_len]. 1.0 means token should be masked.
+            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)`
+                invalid position mask such as query and special symbols (PAD, SEP, CLS)
+                1.0 means token should be masked.
         """
         x = self.dense(hidden_states).squeeze(-1)
 
@@ -516,11 +517,16 @@ class PoolerEndLogits(nn.Module):
 
     def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
         """ Args:
-            One of start_states, start_positions should be not None. If both are set, start_positions overrides start_states.
-            `start_states`: hidden states of the first tokens for the labeled span: torch.LongTensor of shape identical to hidden_states.
-            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
-            `p_mask`: [optional] invalid position mask such as query and special symbols (PAD, SEP, CLS)
-                shape [batch_size, seq_len]. 1.0 means token should be masked.
+            One of ``start_states``, ``start_positions`` should be not None.
+            If both are set, ``start_positions`` overrides ``start_states``.
+
+            **start_states**: ``torch.LongTensor`` of shape identical to hidden_states
+                hidden states of the first tokens for the labeled span.
+            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+                position of the first token for the labeled span: 
+            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
+                Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
+                1.0 means token should be masked.
         """
         slen, hsz = hidden_states.shape[-2:]
         assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
@@ -549,13 +555,21 @@ class PoolerAnswerClass(nn.Module):
         self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
 
     def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
-        """ Args:
-            One of start_states, start_positions should be not None. If both are set, start_positions overrides start_states.
-            `start_states`: hidden states of the first tokens for the labeled span: torch.LongTensor of shape identical to hidden_states.
-            `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
-            `cls_index`: position of the CLS token: torch.LongTensor of shape [batch_size]. If None, take the last token.
+        """
+        Args:
+            One of ``start_states``, ``start_positions`` should be not None.
+            If both are set, ``start_positions`` overrides ``start_states``.
 
-            # note(zhiliny): no dependency on end_feature so that we can obtain one single `cls_logits` for each sample
+            **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``.
+                hidden states of the first tokens for the labeled span.
+            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+                position of the first token for the labeled span.
+            **cls_index**: torch.LongTensor of shape ``(batch_size,)``
+                position of the CLS token. If None, take the last token.
+
+            note(Original repo):
+                no dependency on end_feature so that we can obtain one single `cls_logits`
+                for each sample
         """
         slen, hsz = hidden_states.shape[-2:]
         assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
@@ -577,7 +591,35 @@ class PoolerAnswerClass(nn.Module):
 
 
 class SQuADHead(nn.Module):
-    """ A SQuAD head inspired by XLNet.
+    r""" A SQuAD head inspired by XLNet.
+
+    Parameters:
+        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+
+    Inputs:
+        **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
+            hidden states of sequence tokens
+        **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+            position of the first token for the labeled span.
+        **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+            position of the last token for the labeled span.
+        **cls_index**: torch.LongTensor of shape ``(batch_size,)``
+            position of the CLS token. If None, take the last token.
+        **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)``
+            Whether the question has a possible answer in the paragraph or not.
+        **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
+            Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
+            1.0 means token should be masked.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
+        **last_hidden_state**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) `torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
     """
     def __init__(self, config):
         super(SQuADHead, self).__init__()
@@ -590,8 +632,6 @@ class SQuADHead(nn.Module):
 
     def forward(self, hidden_states, start_positions=None, end_positions=None,
                 cls_index=None, is_impossible=None, p_mask=None):
-        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
-        """
         outputs = ()
 
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
@@ -618,9 +658,8 @@ class SQuADHead(nn.Module):
 
                 # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
                 total_loss += cls_loss * 0.5
-                outputs = (total_loss, start_logits, end_logits, cls_logits) + outputs
-            else:
-                outputs = (total_loss, start_logits, end_logits) + outputs
+
+            outputs = (total_loss,) + outputs
 
         else:
             # during inference, compute the end logits based on beam search
@@ -647,7 +686,7 @@ class SQuADHead(nn.Module):
             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
 
         # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
-        # or (if labels are provided) total_loss, start_logits, end_logits, (cls_logits)
+        # or (if labels are provided) (total_loss,)
         return outputs
 
 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 5e576c51c1..6de4d02103 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -1162,8 +1162,9 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             Labels whether a question has an answer or no answer (SQuAD 2.0)
         **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
-        **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) 
+        **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
+            1.0 means token should be masked. 0.0 mean token is not masked.
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:

From 15d8b1266c3a399e16a9ffe2f8e0420e3c9682a4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 17:30:42 +0200
Subject: [PATCH 132/139] update tokenizer - update squad example for xlnet

---
 examples/run_glue.py                          | 20 +++---
 examples/run_squad.py                         | 24 +++----
 examples/test_examples.py                     | 10 +--
 examples/utils_squad.py                       | 41 ++++++++---
 pytorch_transformers/modeling_utils.py        | 29 +++++---
 pytorch_transformers/modeling_xlnet.py        | 43 ++++++-----
 .../tests/tokenization_bert_test.py           |  5 +-
 .../tests/tokenization_gpt2_test.py           |  5 +-
 .../tests/tokenization_openai_test.py         |  5 +-
 .../tests/tokenization_tests_commons.py       | 13 ++--
 .../tests/tokenization_transfo_xl_test.py     |  5 +-
 .../tests/tokenization_xlm_test.py            |  5 +-
 .../tests/tokenization_xlnet_test.py          |  5 +-
 pytorch_transformers/tokenization_bert.py     |  7 +-
 pytorch_transformers/tokenization_gpt2.py     |  6 +-
 pytorch_transformers/tokenization_openai.py   |  6 +-
 .../tokenization_transfo_xl.py                |  6 +-
 pytorch_transformers/tokenization_utils.py    | 72 +++++++++----------
 pytorch_transformers/tokenization_xlm.py      |  6 +-
 pytorch_transformers/tokenization_xlnet.py    |  9 +--
 20 files changed, 191 insertions(+), 131 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 979c644471..f017db2f6f 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -242,7 +242,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     # Load data features from cache or dataset file
     cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
-        list(filter(None, args.model_name.split('/'))).pop(),
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
     if os.path.exists(cached_features_file):
@@ -282,8 +282,10 @@ def main():
     ## Required parameters
     parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name", default=None, type=str, required=True,
-                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--task_name", default=None, type=str, required=True,
                         help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
@@ -400,15 +402,11 @@ def main():
     if args.local_rank not in [-1, 0]:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name.lower():
-            args.model_type = key  # take the first match in model types
-            break
+    args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 2025217454..e920ebe378 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -213,7 +213,6 @@ def evaluate(args, model, tokenizer, prefix=""):
                 inputs.update({'cls_index': batch[4],
                                'p_mask':    batch[5]})
             outputs = model(**inputs)
-            batch_start_logits, batch_end_logits = outputs[:2]
 
         for i, example_index in enumerate(example_indices):
             eval_feature = features[example_index.item()]
@@ -242,7 +241,8 @@ def evaluate(args, model, tokenizer, prefix=""):
         write_predictions_extended(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file, args.predict_file,
-                        args.start_n_top, args.end_n_top, args.version_2_with_negative)
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
         write_predictions(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, args.do_lower_case, output_prediction_file,
@@ -262,7 +262,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     input_file = args.predict_file if evaluate else args.train_file
     cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
-        list(filter(None, args.model_name.split('/'))).pop(),
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
@@ -312,8 +312,10 @@ def main():
                         help="SQuAD json for training. E.g., train-v1.1.json")
     parser.add_argument("--predict_file", default=None, type=str, required=True,
                         help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
-    parser.add_argument("--model_name", default=None, type=str, required=True,
-                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model checkpoints and predictions will be written.")
 
@@ -438,15 +440,11 @@ def main():
     if args.local_rank not in [-1, 0]:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name.lower():
-            args.model_type = key  # take the first match in model types
-            break
+    args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
diff --git a/examples/test_examples.py b/examples/test_examples.py
index a07c0ea31b..00370e9361 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -60,8 +60,9 @@ class ExamplesTests(unittest.TestCase):
                     "--warmup_steps=2",
                     "--overwrite_output_dir",
                     "--seed=42"]
-        model_name = "--model_name=bert-base-uncased"
-        with patch.object(sys, 'argv', testargs + [model_name]):
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
             result = run_glue.main()
             for value in result.values():
                 self.assertGreaterEqual(value, 0.75)
@@ -85,8 +86,9 @@ class ExamplesTests(unittest.TestCase):
                     "--per_gpu_eval_batch_size=1",
                     "--overwrite_output_dir",
                     "--seed=42"]
-        model_name = "--model_name=bert-base-uncased"
-        with patch.object(sys, 'argv', testargs + [model_name]):
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
             result = run_squad.main()
             self.assertGreaterEqual(result['f1'], 30)
             self.assertGreaterEqual(result['exact'], 30)
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index d898a0a17e..34a0c9cc02 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -87,6 +87,7 @@ class InputFeatures(object):
                  segment_ids,
                  cls_index,
                  p_mask,
+                 paragraph_len,
                  start_position=None,
                  end_position=None,
                  is_impossible=None):
@@ -101,6 +102,7 @@ class InputFeatures(object):
         self.segment_ids = segment_ids
         self.cls_index = cls_index
         self.p_mask = p_mask
+        self.paragraph_len = paragraph_len
         self.start_position = start_position
         self.end_position = end_position
         self.is_impossible = is_impossible
@@ -292,6 +294,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 tokens.append(all_doc_tokens[split_token_index])
                 segment_ids.append(sequence_b_segment_id)
                 p_mask.append(0)
+            paragraph_len = doc_span.length
 
             # SEP token
             tokens.append(sep_token)
@@ -385,6 +388,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                     segment_ids=segment_ids,
                     cls_index=cls_index,
                     p_mask=p_mask,
+                    paragraph_len=paragraph_len,
                     start_position=start_position,
                     end_position=end_position,
                     is_impossible=span_is_impossible))
@@ -673,8 +677,9 @@ RawResultExtended = collections.namedtuple("RawResultExtended",
 def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
                                 max_answer_length, output_prediction_file,
                                 output_nbest_file,
-                                output_null_log_odds_file, orig_data,
-                                start_n_top, end_n_top, version_2_with_negative):
+                                output_null_log_odds_file, orig_data_file,
+                                start_n_top, end_n_top, version_2_with_negative,
+                                tokenizer, verbose_logging):
     """ XLNet write prediction logic (more complex than Bert's).
         Write final predictions to the json file and log-odds of null if needed.
 
@@ -764,13 +769,30 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
                 break
             feature = features[pred.feature_index]
 
-            tok_start_to_orig_index = feature.tok_start_to_orig_index
-            tok_end_to_orig_index = feature.tok_end_to_orig_index
-            start_orig_pos = tok_start_to_orig_index[pred.start_index]
-            end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            # 
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
 
-            paragraph_text = example.paragraph_text
-            final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+                                        verbose_logging)
 
             if final_text in seen_predictions:
                 continue
@@ -829,6 +851,9 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
         with open(output_null_log_odds_file, "w") as writer:
             writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
+    with open(orig_data_file, "r", encoding='utf-8') as reader:
+        orig_data = json.load(reader)["data"]
+
     qid_to_has_ans = make_qid_to_has_ans(orig_data)
     has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
     no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index ebee4fac1d..2c15aa740b 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -528,9 +528,9 @@ class PoolerEndLogits(nn.Module):
                 Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
                 1.0 means token should be masked.
         """
-        slen, hsz = hidden_states.shape[-2:]
         assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
         if start_positions is not None:
+            slen, hsz = hidden_states.shape[-2:]
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
             start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
             start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
@@ -571,7 +571,7 @@ class PoolerAnswerClass(nn.Module):
                 no dependency on end_feature so that we can obtain one single `cls_logits`
                 for each sample
         """
-        slen, hsz = hidden_states.shape[-2:]
+        hsz = hidden_states.shape[-1]
         assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
         if start_positions is not None:
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
@@ -614,12 +614,21 @@ class SQuADHead(nn.Module):
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        **last_hidden_state**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) `torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the last layer of the model.
-        **mems**:
-            list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **start_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
+            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+        **start_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
+            Indices for the top config.start_n_top start token possibilities (beam-search).
+        **end_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **end_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **cls_logits**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size,)``
+            Log probabilities for the ``is_impossible`` label of the answers.
     """
     def __init__(self, config):
         super(SQuADHead, self).__init__()
@@ -667,8 +676,8 @@ class SQuADHead(nn.Module):
             start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
 
             start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
-            start_top_index = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index) # shape (bsz, start_n_top, hsz)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
             start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
 
             hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 6de4d02103..848e73cfc9 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -1167,12 +1167,23 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             1.0 means token should be masked. 0.0 mean token is not masked.
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-start scores (before SoftMax).
-        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
-            Span-end scores (before SoftMax).
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
+        **start_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
+            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+        **start_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
+            Indices for the top config.start_n_top start token possibilities (beam-search).
+        **end_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **end_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **cls_logits**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size,)``
+            Log probabilities for the ``is_impossible`` label of the answers.
         **mems**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -1243,12 +1254,10 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
                 loss_fct_cls = nn.BCEWithLogitsLoss()
                 cls_loss = loss_fct_cls(cls_logits, is_impossible)
 
-                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is
-                # comparable to start_loss and end_loss
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
                 total_loss += cls_loss * 0.5
-                outputs = (total_loss, start_logits, end_logits, cls_logits) + outputs
-            else:
-                outputs = (total_loss, start_logits, end_logits) + outputs
+
+            outputs = (total_loss,) + outputs
 
         else:
             # during inference, compute the end logits based on beam search
@@ -1256,8 +1265,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
 
             start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
-            start_top_index = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
-            start_states = torch.gather(hidden_states, -2, start_top_index) # shape (bsz, start_n_top, hsz)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
             start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
 
             hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
@@ -1269,11 +1278,11 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
 
-            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
-            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)  # get the representation of START as weighted sum of hidden states
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)  # Shape (batch size,): one single `cls_logits` for each sample
 
             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
 
-        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems, (hidden states), (attentions)
-        # or (if labels are provided) total_loss, start_logits, end_logits, (cls_logits), mems, (hidden states), (attentions)
+        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
+        # or (if labels are provided) (total_loss,)
         return outputs
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index dbbe9ac5ea..0b9cfb1b32 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -38,7 +38,10 @@ class TokenizationTest(unittest.TestCase):
             with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
                 vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-            create_and_check_tokenizer_commons(self, BertTokenizer, tmpdirname)
+            input_text = u"UNwant\u00E9d,running"
+            output_text = u"unwanted, running"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, BertTokenizer, tmpdirname)
 
             tokenizer = BertTokenizer(vocab_file)
 
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 8ae8896187..8dae72ec99 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -41,7 +41,10 @@ class GPT2TokenizationTest(unittest.TestCase):
             with open(merges_file, "w") as fp:
                 fp.write("\n".join(merges))
 
-            create_and_check_tokenizer_commons(self, GPT2Tokenizer, tmpdirname, **special_tokens_map)
+            input_text = u"lower newer"
+            output_text = u"lower<unk>newer"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, GPT2Tokenizer, tmpdirname, **special_tokens_map)
 
             tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)
             text = "lower"
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index f5c99877d7..9b4841a605 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -42,7 +42,10 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
             with open(merges_file, "w") as fp:
                 fp.write("\n".join(merges))
 
-            create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, tmpdirname)
+            input_text = u"lower newer"
+            output_text = u"lower newer"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, OpenAIGPTTokenizer, tmpdirname)
 
             tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)
 
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 44adbc6b53..e33ba3cb06 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -113,23 +113,24 @@ def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kw
     tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
 
 
-def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+def create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
     tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
-    text = u"He is very happy, UNwant\u00E9d,running"
-    tokens = tokenizer.tokenize(text)
+    tokens = tokenizer.tokenize(input_text)
     ids = tokenizer.convert_tokens_to_ids(tokens)
-    ids_2 = tokenizer.encode(text)
+    ids_2 = tokenizer.encode(input_text)
     tester.assertListEqual(ids, ids_2)
 
     tokens_2 = tokenizer.convert_ids_to_tokens(ids)
     text_2 = tokenizer.decode(ids)
 
+    tester.assertEqual(text_2, output_text)
+
     tester.assertNotEqual(len(tokens_2), 0)
     tester.assertIsInstance(text_2, (str, unicode))
 
-def create_and_check_tokenizer_commons(tester, tokenizer_class, *inputs, **kwargs):
-    create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+def create_and_check_tokenizer_commons(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
+    create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
     create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index 135f48b0ef..aecfeaef5f 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -34,7 +34,10 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
                 vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-            create_and_check_tokenizer_commons(self, TransfoXLTokenizer, tmpdirname, lower_case=True)
+            input_text = u"<unk> UNwanted , running"
+            output_text = u"<unk> unwanted, running"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, TransfoXLTokenizer, tmpdirname, lower_case=True)
 
             tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
 
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 827ec1606e..97e8fa983f 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -41,7 +41,10 @@ class XLMTokenizationTest(unittest.TestCase):
             with open(merges_file, "w") as fp:
                 fp.write("\n".join(merges))
 
-            create_and_check_tokenizer_commons(self, XLMTokenizer, tmpdirname)
+            input_text = u"lower newer"
+            output_text = u"lower newer"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, XLMTokenizer, tmpdirname)
 
             tokenizer = XLMTokenizer(vocab_file, merges_file)
 
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index e50fe9243d..27c6b984ee 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -32,7 +32,10 @@ class XLNetTokenizationTest(unittest.TestCase):
         with TemporaryDirectory() as tmpdirname:
             tokenizer.save_pretrained(tmpdirname)
 
-            create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
+            input_text = u"This is a test"
+            output_text = u"This is a test"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, XLNetTokenizer, tmpdirname)
 
             tokens = tokenizer.tokenize(u'This is a test')
             self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 8b34a43e5a..f1e900caaf 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -161,10 +161,9 @@ class BertTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(tokens_ids)
-        out_string = ''.join(tokens).replace(' ##', '').strip()
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ' '.join(tokens).replace(' ##', '').strip()
         return out_string
 
     def save_vocabulary(self, vocab_path):
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index bd90a92251..43c57c9cd3 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -185,9 +185,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         return self.decoder.get(index)
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        text = ''.join(tokens_ids)
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        text = ''.join(tokens)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 16d355c57d..0eb5281d39 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -174,9 +174,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         """Converts an id in a token (BPE) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 98b4eb6ff5..b08e8e1cca 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -229,9 +229,9 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             else:
                 raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        out_string = ' '.join(tokens_ids).strip()
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ' '.join(tokens).strip()
         return out_string
 
     def convert_to_tensor(self, symbols):
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 9840e75225..d857e6f2d4 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -361,52 +361,26 @@ class PreTrainedTokenizer(object):
             (resp.) a sequence of ids, using the vocabulary.
         """
         if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
-            return self.convert_token_to_id_with_added_voc(tokens)
+            return self._convert_token_to_id_with_added_voc(tokens)
 
         ids = []
         for token in tokens:
-            ids.append(self.convert_token_to_id_with_added_voc(token))
+            ids.append(self._convert_token_to_id_with_added_voc(token))
         if len(ids) > self.max_len:
             logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
         return ids
 
-
-    def convert_token_to_id_with_added_voc(self, token):
+    def _convert_token_to_id_with_added_voc(self, token):
         if token in self.added_tokens_encoder:
             return self.added_tokens_encoder[token]
         return self._convert_token_to_id(token)
 
-
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """ Converts a single index or a sequence of indices (integers) in a token "
-            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
-
-            Args:
-                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
-        """
-        if isinstance(ids, int):
-            return self.convert_id_to_token(ids)
-        tokens = []
-        for index in ids:
-            if index in self.all_special_ids and skip_special_tokens:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
-        return tokens
-
-
-    def _convert_id_to_token(self, index):
-        raise NotImplementedError
-
-
     def encode(self, text):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
             same as self.convert_tokens_to_ids(self.tokenize(text)).
@@ -414,22 +388,48 @@ class PreTrainedTokenizer(object):
         return self.convert_tokens_to_ids(self.tokenize(text))
 
 
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """ Converts a single index or a sequence of indices (integers) in a token "
+            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+
+            Args:
+                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+        """
+        if isinstance(ids, int):
+            if ids in self.added_tokens_decoder:
+                return self.added_tokens_decoder[ids]
+            else:
+                return self._convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            if index in self.all_special_ids and skip_special_tokens:
+                continue
+            if index in self.added_tokens_decoder:
+                tokens.append(self.added_tokens_decoder[index])
+            else:
+                tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+    def _convert_id_to_token(self, index):
+        raise NotImplementedError
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string.
+            The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
+            but we often want to remove sub-word tokenization artifacts at the same time.
+        """
+        return ' '.join(self.convert_ids_to_tokens(tokens))
+
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
         """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
             with options to remove special tokens and clean up tokenization spaces.
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-        text = self._convert_ids_to_string(filtered_tokens)
+        text = self.convert_tokens_to_string(filtered_tokens)
         if clean_up_tokenization_spaces:
             text = clean_up_tokenization(text)
         return text
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary.
-            roughtly same as ' '.join(self.convert_ids_to_tokens(token_ids)).
-        """
-        return ' '.join(self.convert_ids_to_tokens(tokens_ids))
-
     @property
     def special_tokens_map(self):
         """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 58fefa104b..42b61badcd 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -202,9 +202,9 @@ class XLMTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         return self.decoder.get(index, self.unk_token)
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index d7317b2afc..fa60a18d8a 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -170,9 +170,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
             token = token.decode('utf-8')
         return token
 
-    def _convert_ids_to_string(self, tokens_ids):
-        """Converts a sequence of ids in a string."""
-        out_string = ''.join(tokens_ids).replace(SPIECE_UNDERLINE, ' ')
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
     def save_vocabulary(self, save_directory):
@@ -184,6 +184,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
             return
         out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
 
-        copyfile(self.vocab_file, out_vocab_file)
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
 
         return (out_vocab_file,)

From e691fc0963b64e8c19b4a71ddaefe418096776b6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 17:45:24 +0200
Subject: [PATCH 133/139] update QA models tests + run_generation

---
 examples/run_generation.py                    | 17 ++++++-------
 examples/test_examples.py                     |  3 ++-
 .../tests/modeling_xlm_test.py                | 24 ++++++++++++-------
 .../tests/modeling_xlnet_test.py              | 24 ++++++++++++-------
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index 4108b2894a..a2a8f29103 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -131,8 +131,10 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default=None, required=True,
-                        help="GPT, GPT-2, Transformer-XL or XLNet pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--prompt", type=str, default="")
     parser.add_argument("--padding_text", type=str, default="")
     parser.add_argument("--length", type=int, default=20)
@@ -150,15 +152,10 @@ def main():
 
     set_seed(args)
 
-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name.lower():
-            args.model_type = key  # take the first match in model types
-            break
-
+    args.model_type = args.model_type.lower()
     model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    tokenizer = tokenizer_class.from_pretrained(args.model_name)
-    model = model_class.from_pretrained(args.model_name)
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path)
     model.to(args.device)
     model.eval()
 
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 00370e9361..2f88d129f8 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -101,7 +101,8 @@ class ExamplesTests(unittest.TestCase):
                     "--prompt=Hello",
                     "--length=10",
                     "--seed=42"]
-        model_name = "--model_name=openai-gpt"
+        model_type, model_name = ("--model_type=openai-gpt",
+                                  "--model_name_or_path=openai-gpt")
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_generation.main()
             self.assertGreaterEqual(len(result), 10)
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 85189859a6..4308c18d45 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -191,17 +191,19 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                                          cls_index=sequence_labels,
                                          is_impossible=is_impossible_labels)
 
-            total_loss, start_logits, end_logits, cls_logits = outputs
+            (total_loss,) = outputs
 
             outputs = model(input_ids, start_positions=sequence_labels,
                                          end_positions=sequence_labels)
 
-            total_loss, start_logits, end_logits = outputs
+            (total_loss,) = outputs
 
             result = {
                 "loss": total_loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
+                "start_top_log_probs": start_top_log_probs,
+                "start_top_index": start_top_index,
+                "end_top_log_probs": end_top_log_probs,
+                "end_top_index": end_top_index,
                 "cls_logits": cls_logits,
             }
 
@@ -209,11 +211,17 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 list(result["loss"].size()),
                 [])
             self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
+                list(result["start_top_log_probs"].size()),
+                [self.batch_size, model.config.start_n_top])
             self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+                list(result["start_top_index"].size()),
+                [self.batch_size, model.config.start_n_top])
+            self.parent.assertListEqual(
+                list(result["end_top_log_probs"].size()),
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
+            self.parent.assertListEqual(
+                list(result["end_top_index"].size()),
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
             self.parent.assertListEqual(
                 list(result["cls_logits"].size()),
                 [self.batch_size])
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index 8360a08d60..290c5766e2 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -210,17 +210,19 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                                          cls_index=sequence_labels,
                                          is_impossible=is_impossible_labels)
 
-            total_loss, start_logits, end_logits, cls_logits, mems = outputs
+            total_loss, mems = outputs
 
             outputs = model(input_ids_1, start_positions=sequence_labels,
                                          end_positions=sequence_labels)
 
-            total_loss, start_logits, end_logits, mems = outputs
+            total_loss, mems = outputs
 
             result = {
                 "loss": total_loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
+                "start_top_log_probs": start_top_log_probs,
+                "start_top_index": start_top_index,
+                "end_top_log_probs": end_top_log_probs,
+                "end_top_index": end_top_index,
                 "cls_logits": cls_logits,
                 "mems": mems,
             }
@@ -229,11 +231,17 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 list(result["loss"].size()),
                 [])
             self.parent.assertListEqual(
-                list(result["start_logits"].size()),
-                [self.batch_size, self.seq_length])
+                list(result["start_top_log_probs"].size()),
+                [self.batch_size, model.config.start_n_top])
             self.parent.assertListEqual(
-                list(result["end_logits"].size()),
-                [self.batch_size, self.seq_length])
+                list(result["start_top_index"].size()),
+                [self.batch_size, model.config.start_n_top])
+            self.parent.assertListEqual(
+                list(result["end_top_log_probs"].size()),
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
+            self.parent.assertListEqual(
+                list(result["end_top_index"].size()),
+                [self.batch_size, model.config.start_n_top * model.config.end_n_top])
             self.parent.assertListEqual(
                 list(result["cls_logits"].size()),
                 [self.batch_size])

From 76da9765b6c9acc521bf589fac81075aabe50d78 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 17:52:35 +0200
Subject: [PATCH 134/139] fix run_generation test

---
 examples/test_examples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index 2f88d129f8..688401ebc9 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -103,7 +103,7 @@ class ExamplesTests(unittest.TestCase):
                     "--seed=42"]
         model_type, model_name = ("--model_type=openai-gpt",
                                   "--model_name_or_path=openai-gpt")
-        with patch.object(sys, 'argv', testargs + [model_name]):
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
             result = run_generation.main()
             self.assertGreaterEqual(len(result), 10)
 

From 3b8b0e01bb89152a0c3102f21e5764cafddbe0b0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 00:12:55 +0200
Subject: [PATCH 135/139] update readme

---
 README.md                              | 1709 ++----------------------
 docs/source/serialization.rst          |  171 +++
 pytorch_transformers/modeling_utils.py |   10 +-
 pytorch_transformers/modeling_xlnet.py |   10 +-
 4 files changed, 326 insertions(+), 1574 deletions(-)
 create mode 100644 docs/source/serialization.rst

diff --git a/README.md b/README.md
index dba18a0d5e..129179b817 100644
--- a/README.md
+++ b/README.md
@@ -1,69 +1,41 @@
-# PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
+# 👾 PyTorch-Transformers
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
 
-This repository contains op-for-op PyTorch implementations, pre-trained models and fine-tuning examples for:
+PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 
-- [Google's BERT model](https://github.com/google-research/bert),
-- [OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm),
-- [OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/).
-- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl), and
-- [Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/).
-- [Facebook's XLM model](https://github.com/facebookresearch/XLM/).
+- **[Google's BERT model](https://github.com/google-research/bert)** released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+- **[OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm) released  with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/)** by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+- **[OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/)** released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+- **[Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+- **[Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+- **[Facebook's XLM model](https://github.com/facebookresearch/XLM/)** released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 
-These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the [Examples](#examples) section below.
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet).
 
-Here are some information on these models:
+You can find more details in the [Examples](#examples) section of the documentation.
 
-**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
-
-**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
-
-**OpenAI GPT-2** was released together with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. This PyTorch implementation of OpenAI GPT-2 is an adaptation of the [OpenAI's implementation](https://github.com/openai/gpt-2) and is provided with [OpenAI's pre-trained model](https://github.com/openai/gpt-2) and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
-
-**Google/CMU's Transformer-XL** was released together with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-This PyTorch implementation of XLNet is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
-
-**Google/CMU's XLNet** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-This PyTorch implementation of XLNet is provided with [Google/CMU's pre-trained models](https://github.com/zihangdai/xlnet) and examples. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
-
-**Facebook's XLM** was released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-This PyTorch implementation of XLM is an adaptation of the original [PyTorch implementation](https://github.com/facebookresearch/XLM). A command-line interface is provided to convert original PyTorch checkpoints in PyTorch models according to the present repository.
-
-## Content
+## Readme
 
 | Section | Description |
-| - | - |
+|-|-|
 | [Installation](#installation) | How to install the package |
-| [Overview](#overview) | Overview of the package |
-| [Usage](#usage) | Quickstart examples |
-| [Doc](#doc) |  Detailed documentation |
-| [Examples](#examples) | Detailed examples on how to fine-tune Bert |
-| [Notebooks](#notebooks) | Introduction on the provided Jupyter Notebooks |
-| [TPU](#tpu) | Notes on TPU support and pretraining scripts |
-| [Command-line interface](#Command-line-interface) | Convert a TensorFlow checkpoint in a PyTorch dump |
+| [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 |
+| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning-usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Documentation](#documentation) | Full API documentation and more |
 
 ## Installation
 
-This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
+This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1 to 1.1.0
 
 ### With pip
 
-PyTorch pretrained bert can be installed by pip as follows:
+PyTorch-Transformers can be installed by pip as follows:
 
 ```bash
 pip install pytorch-transformers
 ```
 
-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
-
-```bash
-pip install spacy ftfy==4.4.3
-python -m spacy download en
-```
-
-If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
-
 ### From source
 
 Clone the repository and run:
@@ -72,110 +44,37 @@ Clone the repository and run:
 pip install [--editable] .
 ```
 
-Here also, if you want to reproduce the original tokenization process of the `OpenAI GPT` model, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+### SpaCy, ftfy
+
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you can install `ftfy` (version 4.4.3 if you are using Python 2) and `SpaCy` :
 
 ```bash
 pip install spacy ftfy==4.4.3
 python -m spacy download en
 ```
 
-Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage).
+If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
 
-A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
+### Tests
 
-You can run the tests with the command:
+A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
+
+These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+
+You can run the tests from the root of the cloned repository with the commands:
 
 ```bash
-python -m pytest -sv tests/
+python -m pytest -sv ./pytorch_transformers/tests/
+python -m pytest -sv ./examples/
 ```
 
-## Overview
+## Quick tour: Usage
 
-This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme:
+Here are two quick-start examples using `Bert` and `GPT2` with pre-trained models.
 
-- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_transformers/modeling.py) file):
-  - [`BertModel`](./pytorch_transformers/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
-  - [`BertForMaskedLM`](./pytorch_transformers/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
-  - [`BertForNextSentencePrediction`](./pytorch_transformers/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
-  - [`BertForPreTraining`](./pytorch_transformers/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
-  - [`BertForSequenceClassification`](./pytorch_transformers/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
-  - [`BertForMultipleChoice`](./pytorch_transformers/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-  - [`BertForTokenClassification`](./pytorch_transformers/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
-  - [`BertForQuestionAnswering`](./pytorch_transformers/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
+See the [documentation](#doc) for the details of all the models and classes.
 
-- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py) file):
-  - [`OpenAIGPTModel`](./pytorch_transformers/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
-  - [`OpenAIGPTLMHeadModel`](./pytorch_transformers/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_transformers/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-
-- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) file):
-  - [`TransfoXLModel`](./pytorch_transformers/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
-  - [`TransfoXLLMHeadModel`](./pytorch_transformers/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
-
-- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py) file):
-  - [`GPT2Model`](./pytorch_transformers/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
-  - [`GPT2LMHeadModel`](./pytorch_transformers/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`GPT2DoubleHeadsModel`](./pytorch_transformers/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-
-- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_transformers/tokenization.py) file):
-  - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
-  - `WordpieceTokenizer` - WordPiece tokenization,
-  - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
-
-- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) file):
-  - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization.
-
-- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) file):
-  - `OpenAIGPTTokenizer` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
-
-- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file):
-  - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
-
-- Optimizer (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
-  - `AdamW` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
-
-- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files):
-  - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
-  - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
-  - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
-  - `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files.
-
-The repository further comprises:
-
-- Five examples on how to use **BERT** (in the [`examples` folder](./examples)):
-  - [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py) - Show how to extract hidden states from an instance of `BertModel`,
-  - [`run_bert_classifier.py`](./examples/run_bert_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task,
-  - [`run_bert_squad.py`](./examples/run_bert_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 and SQuAD v2.0 tasks.
-  - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task.
-  - [`simple_lm_finetuning.py`](./examples/lm_finetuning/simple_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPretraining` on a target text corpus.
-
-- One example on how to use **OpenAI GPT** (in the [`examples` folder](./examples)):
-  - [`run_openai_gpt.py`](./examples/run_openai_gpt.py) - Show how to fine-tune an instance of `OpenGPTDoubleHeadsModel` on the RocStories task.
-
-- One example on how to use **Transformer-XL** (in the [`examples` folder](./examples)):
-  - [`run_transfo_xl.py`](./examples/run_transfo_xl.py) - Show how to load and evaluate a pre-trained model of `TransfoXLLMHeadModel` on WikiText 103.
-
-- One example on how to use **OpenAI GPT-2** in the unconditional and interactive mode (in the [`examples` folder](./examples)):
-  - [`run_gpt2.py`](./examples/run_gpt2.py) - Show how to use OpenAI GPT-2 an instance of `GPT2LMHeadModel` to generate text (same as the original OpenAI GPT-2 examples).
-
-  These examples are detailed in the [Examples](#examples) section of this readme.
-
-- Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the [`notebooks` folder](./notebooks)):
-  - [`Comparing-TF-and-PT-models.ipynb`](./notebooks/Comparing-TF-and-PT-models.ipynb) - Compare the hidden states predicted by `BertModel`,
-  - [`Comparing-TF-and-PT-models-SQuAD.ipynb`](./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb) - Compare the spans predicted by  `BertForQuestionAnswering` instances,
-  - [`Comparing-TF-and-PT-models-MLM-NSP.ipynb`](./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb) - Compare the predictions of the `BertForPretraining` instances.
-
-  These notebooks are detailed in the [Notebooks](#notebooks) section of this readme.
-
-- A command-line interface to convert TensorFlow checkpoints (BERT, Transformer-XL) or NumPy checkpoint (OpenAI) in a PyTorch save of the associated PyTorch model:
-
-  This CLI is detailed in the [Command-line interface](#Command-line-interface) section of this readme.
-
-## Usage
-
-### BERT
-
-Here is a quick-start example using `BertTokenizer`, `BertModel` and `BertForMaskedLM` class with Google AI's pre-trained `Bert base uncased` model. See the [doc section](#doc) below for all the details on these classes.
+### BERT example
 
 First let's prepare a tokenized input with `BertTokenizer`
 
@@ -183,14 +82,14 @@ First let's prepare a tokenized input with `BertTokenizer`
 import torch
 from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
 logging.basicConfig(level=logging.INFO)
 
 # Load pre-trained model tokenizer (vocabulary)
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
-# Tokenized input
+# Tokenize input
 text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
 tokenized_text = tokenizer.tokenize(text)
 
@@ -209,11 +108,14 @@ tokens_tensor = torch.tensor([indexed_tokens])
 segments_tensors = torch.tensor([segments_ids])
 ```
 
-Let's see how to use `BertModel` to get hidden states
+Let's see how to use `BertModel` to get encoded inputs:
 
 ```python
 # Load pre-trained model (weights)
 model = BertModel.from_pretrained('bert-base-uncased')
+
+# Set the model in evaluation mode to desactivate the DropOut modules
+# This is IMPORTANT to have reproductible results during evaluation!
 model.eval()
 
 # If you have a GPU, put everything on cuda
@@ -223,12 +125,17 @@ model.to('cuda')
 
 # Predict hidden states features for each layer
 with torch.no_grad():
-    encoded_layers, _ = model(tokens_tensor, segments_tensors)
-# We have a hidden states for each of the 12 layers in model bert-base-uncased
-assert len(encoded_layers) == 12
+    # See the models docstrings for the detail of the inputs
+    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
+    # PyTorch-Transformers models always output tuples.
+    # See the models docstrings for the detail of all the outputs
+    # In our case, the first element is the hidden state of the last layer of the Bert model
+    encoded_layers = outputs[0]
+# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
+assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
 ```
 
-And how to use `BertForMaskedLM`
+And how to use `BertForMaskedLM` to predict a masked token:
 
 ```python
 # Load pre-trained model (weights)
@@ -242,7 +149,8 @@ model.to('cuda')
 
 # Predict all tokens
 with torch.no_grad():
-    predictions = model(tokens_tensor, segments_tensors)
+    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
+    predictions = outputs[0]
 
 # confirm we were able to predict 'henson'
 predicted_index = torch.argmax(predictions[0, masked_index]).item()
@@ -250,177 +158,15 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 assert predicted_token == 'henson'
 ```
 
-### OpenAI GPT
-
-Here is a quick-start example using `OpenAIGPTTokenizer`, `OpenAIGPTModel` and `OpenAIGPTLMHeadModel` class with OpenAI's pre-trained  model. See the [doc section](#doc) below for all the details on these classes.
-
-First let's prepare a tokenized input with `OpenAIGPTTokenizer`
-
-```python
-import torch
-from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-
-# Tokenized input
-text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-tokenized_text = tokenizer.tokenize(text)
-
-# Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-
-# Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
-```
-
-Let's see how to use `OpenAIGPTModel` to get hidden states
-
-```python
-# Load pre-trained model (weights)
-model = OpenAIGPTModel.from_pretrained('openai-gpt')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    hidden_states = model(tokens_tensor)
-```
-
-And how to use `OpenAIGPTLMHeadModel`
-
-```python
-# Load pre-trained model (weights)
-model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    predictions = model(tokens_tensor)
-
-# get the predicted last token
-predicted_index = torch.argmax(predictions[0, -1, :]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == '.</w>'
-```
-
-And how to use `OpenAIGPTDoubleHeadsModel`
-
-```python
-# Load pre-trained model (weights)
-model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
-model.eval()
-
-#  Prepare tokenized input
-text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-tokenized_text1 = tokenizer.tokenize(text1)
-tokenized_text2 = tokenizer.tokenize(text2)
-indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
-```
-
-### Transformer-XL
-
-Here is a quick-start example using `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLModelLMHeadModel` class with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes.
-
-First let's prepare a tokenized input with `TransfoXLTokenizer`
-
-```python
-import torch
-from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary from wikitext 103)
-tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-
-# Tokenized input
-text_1 = "Who was Jim Henson ?"
-text_2 = "Jim Henson was a puppeteer"
-tokenized_text_1 = tokenizer.tokenize(text_1)
-tokenized_text_2 = tokenizer.tokenize(text_2)
-
-# Convert token to vocabulary indices
-indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-
-# Convert inputs to PyTorch tensors
-tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-```
-
-Let's see how to use `TransfoXLModel` to get hidden states
-
-```python
-# Load pre-trained model (weights)
-model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor_1 = tokens_tensor_1.to('cuda')
-tokens_tensor_2 = tokens_tensor_2.to('cuda')
-model.to('cuda')
-
-with torch.no_grad():
-    # Predict hidden states features for each layer
-    hidden_states_1, mems_1 = model(tokens_tensor_1)
-    # We can re-use the memory cells in a subsequent call to attend a longer context
-    hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-```
-
-And how to use `TransfoXLLMHeadModel`
-
-```python
-# Load pre-trained model (weights)
-model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor_1 = tokens_tensor_1.to('cuda')
-tokens_tensor_2 = tokens_tensor_2.to('cuda')
-model.to('cuda')
-
-with torch.no_grad():
-    # Predict all tokens
-    predictions_1, mems_1 = model(tokens_tensor_1)
-    # We can re-use the memory cells in a subsequent call to attend a longer context
-    predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-
-# get the predicted last token
-predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'who'
-```
-
 ### OpenAI GPT-2
 
-Here is a quick-start example using `GPT2Tokenizer`, `GPT2Model` and `GPT2LMHeadModel` class with OpenAI's pre-trained  model. See the [doc section](#doc) below for all the details on these classes.
+Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model.
 
 First let's prepare a tokenized input with `GPT2Tokenizer`
 
 ```python
 import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -430,787 +176,53 @@ logging.basicConfig(level=logging.INFO)
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
 # Encode some inputs
-text_1 = "Who was Jim Henson ?"
-text_2 = "Jim Henson was a puppeteer"
-indexed_tokens_1 = tokenizer.encode(text_1)
-indexed_tokens_2 = tokenizer.encode(text_2)
+text = "Who was Jim Henson ? Jim Henson was a"
+indexed_tokens = tokenizer.encode(text)
 
 # Convert inputs to PyTorch tensors
-tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor = torch.tensor([indexed_tokens])
 ```
 
-Let's see how to use `GPT2Model` to get hidden states
-
-```python
-# Load pre-trained model (weights)
-model = GPT2Model.from_pretrained('gpt2')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor_1 = tokens_tensor_1.to('cuda')
-tokens_tensor_2 = tokens_tensor_2.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    hidden_states_1, past = model(tokens_tensor_1)
-    # past can be used to reuse precomputed hidden state in a subsequent predictions
-    # (see beam-search examples in the run_gpt2.py example).
-    hidden_states_2, past = model(tokens_tensor_2, past=past)
-```
-
-And how to use `GPT2LMHeadModel`
+Let's see how to use `GPT2LMHeadModel` to generate some text from our prompt:
 
 ```python
 # Load pre-trained model (weights)
 model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+# Set the model in evaluation mode to desactivate the DropOut modules
+# This is IMPORTANT to have reproductible results during evaluation!
 model.eval()
 
 # If you have a GPU, put everything on cuda
-tokens_tensor_1 = tokens_tensor_1.to('cuda')
-tokens_tensor_2 = tokens_tensor_2.to('cuda')
+tokens_tensor = tokens_tensor.to('cuda')
 model.to('cuda')
 
 # Predict all tokens
 with torch.no_grad():
-    predictions_1, past = model(tokens_tensor_1)
-    # past can be used to reuse precomputed hidden state in a subsequent predictions
-    # (see beam-search examples in the run_gpt2.py example).
-    predictions_2, past = model(tokens_tensor_2, past=past)
+    outputs = model(tokens_tensor)
+    predictions = outputs[0]
 
-# get the predicted last token
-predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-predicted_token = tokenizer.decode([predicted_index])
+# get the predicted next sub-word (in our case, the word 'man')
+predicted_index = torch.argmax(predictions[0, -1, :]).item()
+predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
+assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
 ```
 
-And how to use `GPT2DoubleHeadsModel`
+Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the documentation.
 
-```python
-# Load pre-trained model (weights)
-model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
-model.eval()
+## Quick tour: Fine-tuning/usage scripts
 
-#  Prepare tokenized input
-text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-tokenized_text1 = tokenizer.tokenize(text1)
-tokenized_text2 = tokenizer.tokenize(text2)
-indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+We include several example script with SOTA performances for NLU and NLG tasks:
 
-# Predict hidden states features for each layer
-with torch.no_grad():
-    lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
-```
+- fine-tuning Bert/XLNet/XLM with a *sequence-level classifier* on nine different GLUE tasks,
+- fine-tuning Bert/XLNet/XLM with a *token-level classifier* on the question answering dataset SQuAD 2.0, and
+- using GPT/GPT-2/Transformer-XL and XLNet for conditional language generation.
 
-## Doc
+Here are three quick examples:
 
-Here is a detailed documentation of the classes in the package and how to use them:
+### Fine-tuning for sequence classification: GLUE tasks examples
 
-| Sub-section | Description |
-|-|-|
-| [Loading pre-trained weights](#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump) | How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance |
-| [Serialization best-practices](#serialization-best-practices) | How to save and reload a fine-tuned model |
-| [Configurations](#configurations) | API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL |
-| [Models](#models) | API of the PyTorch model classes for BERT, GPT, GPT-2 and Transformer-XL |
-| [Tokenizers](#tokenizers) | API of the tokenizers class for BERT, GPT, GPT-2 and Transformer-XL|
-| [Optimizers](#optimizers) |  API of the optimizers |
-
-### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
-
-### `from_pretrained()` method
-
-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method:
-
-```python
-model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
-```
-
-where
-
-- `BERT_CLASS` is either a tokenizer to load the vocabulary (`BertTokenizer` or `OpenAIGPTTokenizer` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice`, `BertForQuestionAnswering`, `OpenAIGPTModel`, `OpenAIGPTLMHeadModel` or `OpenAIGPTDoubleHeadsModel`, and
-- `PRE_TRAINED_MODEL_NAME_OR_PATH` is either:
-
-  - the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
-
-    - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters
-    - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
-    - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_bert_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
-
-  - a path or url to a pretrained model archive containing:
-
-    - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
-    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
-
-  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
-
-- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
-- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
-- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
-- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
-
-`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
-
-**When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
-
-Examples:
-
-```python
-# BERT
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-# OpenAI GPT
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-model = OpenAIGPTModel.from_pretrained('openai-gpt')
-
-# Transformer-XL
-tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-
-# OpenAI GPT-2
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-model = GPT2Model.from_pretrained('gpt2')
-
-```
-
-#### Cache directory
-
-`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
-
-- `cache_dir` optional arguments to the `from_pretrained()` method (see above),
-- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
-- PyTorch cache home + `/pytorch_transformers/`
-  where PyTorch cache home is defined by (in this order):
-  - shell environment variable `ENV_TORCH_HOME`
-  - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
-  - default: `~/.cache/torch/`
-
-Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
-
-You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
-
-### Serialization best-practices
-
-This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
-There are three types of files you need to save to be able to reload a fine-tuned model:
-
-- the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
-- the configuration file of the model which is saved as a JSON file, and
-- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
-
-The *default filenames* of these files are as follow:
-
-- the model weights file: `pytorch_model.bin`,
-- the configuration file: `config.json`,
-- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary),
-- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`.
-
-**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.**
-
-Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
-
-```python
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-
-output_dir = "./models/"
-
-# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-# If we have a distributed model, save only the encapsulated model
-# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-model_to_save = model.module if hasattr(model, 'module') else model
-
-# If we save using the predefined names, we can load using `from_pretrained`
-output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-torch.save(model_to_save.state_dict(), output_model_file)
-model_to_save.config.to_json_file(output_config_file)
-tokenizer.save_vocabulary(output_dir)
-
-# Step 2: Re-load the saved model and vocabulary
-
-# Example for a Bert model
-model = BertForQuestionAnswering.from_pretrained(output_dir)
-tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
-# Example for a GPT model
-model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-```
-
-Here is another way you can save and reload the model if you want to use specific paths for each type of files:
-
-```python
-output_model_file = "./models/my_own_model_file.bin"
-output_config_file = "./models/my_own_config_file.bin"
-output_vocab_file = "./models/my_own_vocab_file.bin"
-
-# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-# If we have a distributed model, save only the encapsulated model
-# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-model_to_save = model.module if hasattr(model, 'module') else model
-
-torch.save(model_to_save.state_dict(), output_model_file)
-model_to_save.config.to_json_file(output_config_file)
-tokenizer.save_vocabulary(output_vocab_file)
-
-# Step 2: Re-load the saved model and vocabulary
-
-# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-# Here is how to do it in this situation:
-
-# Example for a Bert model
-config = BertConfig.from_json_file(output_config_file)
-model = BertForQuestionAnswering(config)
-state_dict = torch.load(output_model_file)
-model.load_state_dict(state_dict)
-tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
-
-# Example for a GPT model
-config = OpenAIGPTConfig.from_json_file(output_config_file)
-model = OpenAIGPTDoubleHeadsModel(config)
-state_dict = torch.load(output_model_file)
-model.load_state_dict(state_dict)
-tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-```
-
-### Configurations
-
-Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which containes the parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON configuration files. The respective configuration classes are:
-
-- `BertConfig` for `BertModel` and BERT classes instances.
-- `OpenAIGPTConfig` for `OpenAIGPTModel` and OpenAI GPT classes instances.
-- `GPT2Config` for `GPT2Model` and OpenAI GPT-2 classes instances.
-- `TransfoXLConfig` for `TransfoXLModel` and Transformer-XL classes instances.
-
-These configuration classes contains a few utilities to load and save configurations:
-
-- `from_dict(cls, json_object)`: A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
-- `from_json_file(cls, json_file)`: A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
-- `to_dict()`: Serializes an instance to a Python dictionary. Returns a dictionary.
-- `to_json_string()`: Serializes an instance to a JSON string. Returns a string.
-- `to_json_file(json_file_path)`: Save an instance to a json file.
-
-### Models
-
-#### 1. `BertModel`
-
-`BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large).
-
-Instantiation:
-The model can be instantiated with the following arguments:
-
-- `config`: a `BertConfig` class instance with the configuration to build a new model.
-- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
-
-The inputs and output are **identical to the TensorFlow model inputs and outputs**.
-
-We detail them here. This model takes as *inputs*:
-[`modeling.py`](./pytorch_transformers/modeling.py)
-
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
-- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
-- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
-- `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
-- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
-
-This model *outputs* a tuple composed of:
-
-- `encoded_layers`: controled by the value of the `output_encoded_layers` argument:
-
-  - `output_all_encoded_layers=True`: outputs a list of the encoded-hidden-states at the end of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-  - `output_all_encoded_layers=False`: outputs only the encoded-hidden-states corresponding to the last attention block, i.e. a single torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
-
-- `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
-
-An example on how to use this class is given in the [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py) script which can be used to extract the hidden states of the model for a given input.
-
-#### 2. `BertForPreTraining`
-
-`BertForPreTraining` includes the `BertModel` Transformer followed by the two pre-training heads:
-
-- the masked language modeling head, and
-- the next sentence classification head.
-
-*Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus two optional labels:
-
-- `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]
-- `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-
-*Outputs*:
-
-- if `masked_lm_labels` and `next_sentence_label` are not `None`: Outputs the total_loss which is the sum of the masked language modeling loss and the next sentence classification loss.
-- if `masked_lm_labels` or `next_sentence_label` is `None`: Outputs a tuple comprising
-
-  - the masked language modeling logits, and
-  - the next sentence classification logits.
-
-An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script which can be used to fine-tune the BERT language model on your specific different text corpus. This should improve model performance, if the language style is different from the original BERT training corpus (Wiki + BookCorpus).
-
-#### 3. `BertForMaskedLM`
-
-`BertForMaskedLM` includes the `BertModel` Transformer followed by the (possibly) pre-trained  masked language modeling head.
-
-*Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus optional label:
-
-- `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]
-
-*Outputs*:
-
-- if `masked_lm_labels` is not `None`: Outputs the masked language modeling loss.
-- if `masked_lm_labels` is `None`: Outputs the masked language modeling logits.
-
-#### 4. `BertForNextSentencePrediction`
-
-`BertForNextSentencePrediction` includes the `BertModel` Transformer followed by the next sentence classification head.
-
-*Inputs* comprises the inputs of the [`BertModel`](#-1.-`BertModel`) class plus an optional label:
-
-- `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-
-*Outputs*:
-
-- if `next_sentence_label` is not `None`: Outputs the next sentence classification loss.
-- if `next_sentence_label` is `None`: Outputs the next sentence classification logits.
-
-#### 5. `BertForSequenceClassification`
-
-`BertForSequenceClassification` is a fine-tuning model that includes `BertModel` and a sequence-level (sequence or pair of sequences) classifier on top of the `BertModel`.
-
-The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence (see Figures 3a and 3b in the BERT paper).
-
-An example on how to use this class is given in the [`run_bert_classifier.py`](./examples/run_bert_classifier.py) script which can be used to fine-tune a single sequence (or pair of sequence) classifier using BERT, for example for the MRPC task.
-
-#### 6. `BertForMultipleChoice`
-
-`BertForMultipleChoice` is a fine-tuning model that includes `BertModel` and a linear layer on top of the `BertModel`.
-
-The linear layer outputs a single value for each choice of a multiple choice problem, then all the outputs corresponding to an instance are passed through a softmax to get the model choice.
-
-This implementation is largely inspired by the work of OpenAI in [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) and the answer of Jacob Devlin in the following [issue](https://github.com/google-research/bert/issues/38).
-
-An example on how to use this class is given in the [`run_swag.py`](./examples/run_swag.py) script which can be used to fine-tune a multiple choice classifier using BERT, for example for the Swag task.
-
-#### 7. `BertForTokenClassification`
-
-`BertForTokenClassification` is a fine-tuning model that includes `BertModel` and a token-level classifier on top of the `BertModel`.
-
-The token-level classifier is a linear layer that takes as input the last hidden state of the sequence.
-
-#### 8. `BertForQuestionAnswering`
-
-`BertForQuestionAnswering` is a fine-tuning model that includes `BertModel` with a token-level classifiers on top of the full sequence of last hidden states.
-
-The token-level classifier takes as input the full sequence of the last hidden state and compute several (e.g. two) scores for each tokens that can for example respectively be the score that a given token is a `start_span` and a `end_span` token (see Figures 3c and 3d in the BERT paper).
-
-An example on how to use this class is given in the [`run_bert_squad.py`](./examples/run_bert_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task.
-
-#### 9. `OpenAIGPTModel`
-
-`OpenAIGPTModel` is the basic OpenAI GPT Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
-
-OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-Special tokens embeddings are additional tokens that are not pre-trained: `[SEP]`, `[CLS]`...
-Special tokens need to be trained during the fine-tuning if you use them.
-The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-The embeddings are ordered as follow in the token embeddings matrice:
-
-```python
-    [0,                                                         ----------------------
-      ...                                                        -> word embeddings
-      config.vocab_size - 1,                                     ______________________
-      config.vocab_size,
-      ...                                                        -> special embeddings
-      config.vocab_size + config.n_special - 1]                  ______________________
-```
-
-where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-    `total_tokens_embeddings = config.vocab_size + config.n_special`
-You should use the associate indices to index the embeddings.
-
-Instantiation:
-The model can be instantiated with the following arguments:
-
-- `config`: a `OpenAIConfig` class instance with the configuration to build a new model.
-- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
-
-The inputs and output are **identical to the TensorFlow model inputs and outputs**.
-
-We detail them here. This model takes as *inputs*:
-[`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
-
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-- `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-    with the position indices (selected in the range [0, config.n_positions - 1[.
-- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-    You can use it to add a third type of embedding to each input token in the sequence
-    (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
-- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
-
-This model *outputs*:
-
-- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-
-#### 10. `OpenAIGPTLMHeadModel`
-
-`OpenAIGPTLMHeadModel` includes the `OpenAIGPTModel` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
-
-*Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus optional labels:
-
-- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
-
-*Outputs*:
-
-- if `lm_labels` is not `None`:
-  Outputs the language modeling loss.
-- else:
-  Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids)
-
-#### 11. `OpenAIGPTDoubleHeadsModel`
-
-`OpenAIGPTDoubleHeadsModel` includes the `OpenAIGPTModel` Transformer followed by two heads:
-
-- a language modeling head with weights tied to the input embeddings (no additional parameters) and:
-- a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
-
-*Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus a classification mask and two optional labels:
-
-- `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
-- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
-- `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
-
-*Outputs*:
-
-- if `lm_labels` and `multiple_choice_labels` are not `None`:
-  Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-- else Outputs a tuple with:
-  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-  - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-
-#### 12. `TransfoXLModel`
-
-The Transformer-XL model is described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context".
-
-Transformer XL use a relative positioning with sinusiodal patterns and adaptive softmax inputs which means that:
-
-- you don't need to specify positioning embeddings indices
-- the tokens in the vocabulary have to be sorted to decreasing frequency.
-
-This model takes as *inputs*:
-[`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
-
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
-- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
-
-This model *outputs* a tuple of (last_hidden_state, new_mems)
-
-- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
-
-##### Extracting a list of the hidden states at each layer of the Transformer-XL from `last_hidden_state` and `new_mems`
-
-The `new_mems` contain all the hidden states PLUS the output of the embeddings (`new_mems[0]`). `new_mems[-1]` is the output of the hidden state of the layer below the last layer and `last_hidden_state` is the output of the last layer (i.E. the input of the softmax when we have a language modeling head on top).
-
-There are two differences between the shapes of `new_mems` and `last_hidden_state`: `new_mems` have transposed first dimensions and are longer (of size `self.config.mem_len`). Here is how to extract the full list of hidden states from the model output:
-
-```python
-hidden_states, mems = model(tokens_tensor)
-seq_length = hidden_states.size(1)
-lower_hidden_states = list(t[-seq_length:, ...].transpose(0, 1) for t in mems)
-all_hidden_states = lower_hidden_states + [hidden_states]
-```
-
-#### 13. `TransfoXLLMHeadModel`
-
-`TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
-
-*Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
-
-- `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the labels token indices selected in the range [0, self.config.n_token[
-
-*Outputs* a tuple of (last_hidden_state, new_mems)
-
-- `softmax_output`: output of the (adaptive) softmax:
-  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
-  - else: Negative log likelihood of labels tokens with shape [batch_size, sequence_length]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
-
-#### 14. `GPT2Model`
-
-`GPT2Model` is the OpenAI GPT-2 Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
-
-Instantiation:
-The model can be instantiated with the following arguments:
-
-- `config`: a `GPT2Config` class instance with the configuration to build a new model.
-- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
-
-The inputs and output are **identical to the TensorFlow model inputs and outputs**.
-
-We detail them here. This model takes as *inputs*:
-[`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
-
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
-- `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-    with the position indices (selected in the range [0, config.n_positions - 1[.
-- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-    You can use it to add a third type of embedding to each input token in the sequence
-    (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
-- `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below).
-- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
-
-This model *outputs*:
-
-- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-- `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
-
-#### 15. `GPT2LMHeadModel`
-
-`GPT2LMHeadModel` includes the `GPT2Model` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
-
-*Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus optional labels:
-
-- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
-
-*Outputs*:
-
-- if `lm_labels` is not `None`:
-  Outputs the language modeling loss.
-- else: a tuple of
-  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids)
-  - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
-
-#### 16. `GPT2DoubleHeadsModel`
-
-`GPT2DoubleHeadsModel` includes the `GPT2Model` Transformer followed by two heads:
-
-- a language modeling head with weights tied to the input embeddings (no additional parameters) and:
-- a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
-
-*Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus a classification mask and two optional labels:
-
-- `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
-- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
-- `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
-
-*Outputs*:
-
-- if `lm_labels` and `multiple_choice_labels` are not `None`:
-  Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-- else Outputs a tuple with:
-  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-  - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-  - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
-
-### Tokenizers
-
-#### `BertTokenizer`
-
-`BertTokenizer` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
-
-This class has five arguments:
-
-- `vocab_file`: path to a vocabulary file.
-- `do_lower_case`: convert text to lower-case while tokenizing. **Default = True**.
-- `max_len`: max length to filter the input of the Transformer. Default to pre-trained value for the model if `None`. **Default = None**
-- `do_basic_tokenize`: Do basic tokenization before wordpice tokenization. Set to false if text is pre-tokenized. **Default = True**.
-- `never_split`: a list of tokens that should not be splitted during tokenization. **Default = `["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]`**
-
-and three methods:
-
-- `tokenize(text)`: convert a `str` in a list of `str` tokens by (1) performing basic tokenization and (2) WordPiece tokenization.
-- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
-- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
-- `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
-
-Please refer to the doc strings and code in [`tokenization.py`](./pytorch_transformers/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
-
-#### `OpenAIGPTTokenizer`
-
-`OpenAIGPTTokenizer` perform Byte-Pair-Encoding (BPE) tokenization.
-
-This class has four arguments:
-
-- `vocab_file`: path to a vocabulary file.
-- `merges_file`: path to a file containing the BPE merges.
-- `max_len`: max length to filter the input of the Transformer. Default to pre-trained value for the model if `None`. **Default = None**
-- `special_tokens`: a list of tokens to add to the vocabulary for fine-tuning. If SpaCy is not installed and BERT's `BasicTokenizer` is used as the pre-BPE tokenizer, these tokens are not split. **Default= None**
-
-and five methods:
-
-- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing BPE tokenization.
-- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
-- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
-- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
-- `encode(text)`: convert a `str` in a list of `int` tokens by performing BPE encoding.
-- `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
-- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
-
-Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
-
-#### `TransfoXLTokenizer`
-
-`TransfoXLTokenizer` perform word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by toekn frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details.
-
-The API is similar to the API of `BertTokenizer` (see above).
-
-Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
-
-#### `GPT2Tokenizer`
-
-`GPT2Tokenizer` perform byte-level Byte-Pair-Encoding (BPE) tokenization.
-
-This class has three arguments:
-
-- `vocab_file`: path to a vocabulary file.
-- `merges_file`: path to a file containing the BPE merges.
-- `errors`: How to handle unicode decoding errors. **Default = `replace`**
-
-and two methods:
-
-- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing byte-level BPE.
-- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
-- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
-- `set_special_tokens(self, special_tokens)`: update the list of special tokens (see above arguments)
-- `encode(text)`: convert a `str` in a list of `int` tokens by performing byte-level BPE.
-- `decode(tokens)`: convert back a list of `int` tokens in a `str`.
-- `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
-
-Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
-
-### Optimizers
-
-#### `AdamW`
-
-`AdamW` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
-
-- AdamW implements weight decay fix,
-
-The optimizer accepts the following arguments:
-
-- `lr` : learning rate
-- `warmup` : portion of `t_total` for the warmup, `-1`  means no warmup. Default : `-1`
-- `t_total` : total number of training steps for the learning
-    rate schedule, `-1`  means constant learning rate. Default : `-1`
-- `schedule` : schedule to use for the warmup (see above).
-    Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
-    If `None` or `'none'`, learning rate is always kept constant.
-    Default : `'warmup_linear'`
-- `b1` : Adams b1. Default : `0.9`
-- `b2` : Adams b2. Default : `0.999`
-- `e` : Adams epsilon. Default : `1e-6`
-- `weight_decay:` Weight decay. Default : `0.01`
-- `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`
-
-#### Learning Rate Schedules
-
-The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
-All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction.
-When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`,
-the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used.
-An overview of the implemented schedules:
-
-- `ConstantLR`: always returns learning rate 1.
-- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Keeps learning rate equal to 1. after warmup.
-    ![](docs/source/imgs/warmup_constant_schedule.png)
-- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    ![](docs/source/imgs/warmup_linear_schedule.png)
--  `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
-    If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    ![](docs/source/imgs/warmup_cosine_schedule.png)
-- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts).
-    ![](docs/source/imgs/warmup_cosine_hard_restarts_schedule.png)
-- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length.
-    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles`
-    ![warmup cosine warm restarts schedule](docs/source/imgs/warmup_cosine_warm_restarts_schedule.png)
-
-## Examples
-
-| Sub-section | Description |
-|-|-|
-| [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models |
-| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_bert_classifier.py`, `run_bert_squad.py` and `run_lm_finetuning.py` |
-| [Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2](#openai-gpt-transformer-xl-and-gpt-2-running-the-examples) | Running the examples in [`./examples`](./examples/): `run_openai_gpt.py`, `run_transfo_xl.py` and `run_gpt2.py` |
-| [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`|
-
-### Training large models: introduction, tools and examples
-
-BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
-
-To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py): gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
-
-Here is how to use these techniques in our scripts:
-
-- **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
-- **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
-- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
-- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
-
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
-
-Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
-```
-
-Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
-
-### Fine-tuning with BERT: running the examples
-
-We showcase several fine-tuning examples based on (and extended from) [the original implementation](https://github.com/google-research/bert/):
-
-- a *sequence-level classifier* on nine different GLUE tasks,
-- a *token-level classifier* on the question answering dataset SQuAD, and
-- a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
-- a *BERT language model* on another target corpus
-
-#### GLUE results on dev set
-
-We get the following results on the dev set of GLUE benchmark with an uncased BERT base
-model. All experiments were run on a P100 GPU with a batch size of 32.
-
-| Task | Metric | Result |
-|-|-|-|
-| CoLA | Matthew's corr. | 57.29 |
-| SST-2 | accuracy | 93.00 |
-| MRPC | F1/accuracy | 88.85/83.82 |
-| STS-B | Pearson/Spearman corr. | 89.70/89.37 |
-| QQP | accuracy/F1 | 90.72/87.41 |
-| MNLI | matched acc./mismatched acc.| 83.95/84.39 |
-| QNLI | accuracy | 89.04 |
-| RTE | accuracy | 61.01 |
-| WNLI | accuracy | 53.52 |
-
-Some of these results are significantly different from the ones reported on the test set
-of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
+The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.
 
 Before running anyone of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
@@ -1239,64 +251,56 @@ where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WN
 
 The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
 
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor.
+#### Fine-tuning XLNet model on the STS-B regression task
 
-#### MRPC
-
-This example code fine-tunes BERT on the Microsoft Research Paraphrase
-Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
-
-Before running this example you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
+This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
+Parallel training is a simple way to use several GPU (but it is slower and less flexible than distributed training, see below).
 
 ```shell
 export GLUE_DIR=/path/to/glue
 
-python run_bert_classifier.py \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --bert_model bert-base-uncased \
-  --max_seq_length 128 \
-  --train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
+python ./examples/run_glue.py \
+    --model_type xlnet \
+    --model_name_or_path xlnet-large-cased \
+    --do_train  \
+    --task_name=sts-b     \
+    --data_dir=${GLUE_DIR}/STS-B  \
+    --output_dir=./proc_data/sts-b-110   \
+    --max_seq_length=128   \
+    --per_gpu_eval_batch_size=8   \
+    --per_gpu_train_batch_size=8   \
+    --gradient_accumulation_steps=1 \
+    --max_steps=1200  \
+    --model_name=xlnet-large-cased   \
+    --overwrite_output_dir   \
+    --overwrite_cache \
+    --warmup_steps=120
 ```
 
-Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 84% and 88%.
+On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine.
+These hyper-parameters give evaluation results pearsonr of `0.918`.
 
-**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
-First install apex as indicated [here](https://github.com/NVIDIA/apex).
-Then run
+#### Fine-tuning Bert model on the MRPC classification task
 
-```shell
-export GLUE_DIR=/path/to/glue
-
-python run_bert_classifier.py \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --bert_model bert-base-uncased \
-  --max_seq_length 128 \
-  --train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/ \
-  --fp16
-```
-
-**Distributed training**
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC:
+This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name MRPC --do_train   --do_eval   --do_lower_case   --data_dir $GLUE_DIR/MRPC/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0  --output_dir /tmp/mrpc_output/
+python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   \
+    --model_type bert \
+    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --task_name MRPC \
+    --do_train   \
+    --do_eval   \
+    --do_lower_case   \
+    --data_dir $GLUE_DIR/MRPC/   \
+    --max_seq_length 128   \
+    --per_gpu_eval_batch_size=8   \
+    --per_gpu_train_batch_size=8   \
+    --learning_rate 2e-5   \
+    --num_train_epochs 3.0  \
+    --output_dir /tmp/mrpc_output/ \
+    --overwrite_output_dir   \
+    --overwrite_cache \
 ```
 
 Training with these hyper-parameters gave us the following results:
@@ -1310,83 +314,26 @@ Training with these hyper-parameters gave us the following results:
   loss = 0.07231863956341798
 ```
 
-Here is an example on MNLI:
+### Fine-tuning for question-answering: SQuAD example
+
+This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --do_lower_case   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
-```
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
-
-#### SQuAD
-
-This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
-
-The data for SQuAD can be downloaded with the following links and should be saved in a `$SQUAD_DIR` directory.
-
-- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-- [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-- [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-
-```shell
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_bert_squad.py \
-  --bert_model bert-base-uncased \
-  --do_train \
-  --do_predict \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
-```
-
-Training with the previous hyper-parameters gave us the following results:
-
-```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
-{"f1": 88.52381567990474, "exact_match": 81.22043519394512}
-```
-
-##### distributed training
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 \
- run_bert_squad.py \
- --bert_model bert-large-uncased-whole-word-masking  \
- --do_train \
- --do_predict \
- --do_lower_case \
- --train_file $SQUAD_DIR/train-v1.1.json \
- --predict_file $SQUAD_DIR/dev-v1.1.json \
- --learning_rate 3e-5 \
- --num_train_epochs 2 \
- --max_seq_length 384 \
- --doc_stride 128 \
- --output_dir ../models/wwm_uncased_finetuned_squad/ \
- --train_batch_size 24 \
- --gradient_accumulation_steps 12
+python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --do_train \
+    --do_predict \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=3   \
+    --per_gpu_train_batch_size=3   \
 ```
 
 Training with these hyper-parameters gave us the following results:
@@ -1398,391 +345,25 @@ python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncase
 
 This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
 
-And here is the model provided as `bert-large-cased-whole-word-masking-finetuned-squad`:
+### Conditional generation: Text generation with GPT, GPT-2, Transformer-XL and XLNet
 
-```bash
-python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
-```
+A conditional generation script is also included to generate text from a prompt.
+The generation script include the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
 
-Training with these hyper-parameters gave us the following results:
-
-```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-{"exact_match": 84.18164616840113, "f1": 91.58645594850135}
-```
-
-#### SWAG
-
-The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf)
+Here is how to run the script with the small version of OpenAI GPT-2 model:
 
 ```shell
-export SWAG_DIR=/path/to/SWAG
-
-python run_bert_swag.py \
-  --bert_model bert-base-uncased \
-  --do_train \
-  --do_lower_case \
-  --do_eval \
-  --data_dir $SWAG_DIR/data \
-  --train_batch_size 16 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --max_seq_length 80 \
-  --output_dir /tmp/swag_output/ \
-  --gradient_accumulation_steps 4
+python ./examples/run_glue.py \
+    --model_type=gpt2 \
+    --length=20 \
+    --model_name_or_path=gpt2 \
 ```
 
-Training with the previous hyper-parameters on a single GPU gave us the following results:
+## Documentation
 
-```bash
-eval_accuracy = 0.8062081375587323
-eval_loss = 0.5966546792367169
-global_step = 13788
-loss = 0.06423990014260186
-```
+The full documentation is available at https://huggingface.co/pytorch-transformers/.
 
-#### LM Fine-tuning
+## Citation
 
-The data should be a text file in the same format as [sample_text.txt](./samples/sample_text.txt)  (one sentence per line, docs separated by empty line).
-You can download an [exemplary training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from wikipedia articles and splitted into ~500k sentences with spaCy.
-Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`:
-
-Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the [`README`](./examples/lm_finetuning/README.md) of the [`examples/lm_finetuning/`](./examples/lm_finetuning/) folder.
-
-### OpenAI GPT, Transformer-XL and GPT-2: running the examples
-
-We provide three examples of scripts for OpenAI GPT, Transformer-XL and OpenAI GPT-2 based on (and extended from) the respective original implementations:
-
-- fine-tuning OpenAI GPT on the ROCStories dataset
-- evaluating Transformer-XL on Wikitext 103
-- unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
-
-#### Fine-tuning OpenAI GPT on the RocStories dataset
-
-This example code fine-tunes OpenAI GPT on the RocStories dataset.
-
-Before running this example you should download the
-[RocStories dataset](https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories) and unpack it to some directory `$ROC_STORIES_DIR`.
-
-```shell
-export ROC_STORIES_DIR=/path/to/RocStories
-
-python run_openai_gpt.py \
-  --model_name openai-gpt \
-  --do_train \
-  --do_eval \
-  --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
-  --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
-  --output_dir ../log \
-  --train_batch_size 16 \
-```
-
-This command runs in about 10 min on a single K-80 an gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
-
-#### Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
-
-This example code evaluate the pre-trained Transformer-XL on the WikiText 103 dataset.
-This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
-
-```shell
-python run_transfo_xl.py --work_dir ../log
-```
-
-This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
-
-#### Unconditional and conditional generation from OpenAI's GPT-2 model
-
-This example code is identical to the original unconditional and conditional generation codes.
-
-Conditional generation:
-
-```shell
-python run_gpt2.py
-```
-
-Unconditional generation:
-
-```shell
-python run_gpt2.py --unconditional
-```
-
-The same option as in the original scripts are provided, please refere to the code of the example and the original repository of OpenAI.
-
-## Fine-tuning BERT-large on GPUs
-
-The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
-
-For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
-
-```bash
-{"exact_match": 84.56953642384106, "f1": 91.04028647786927}
-```
-
-To get these results we used a combination of:
-
-- multi-GPU training (automatically activated on a multi-GPU server),
-- 2 steps of gradient accumulation and
-- perform the optimization step on CPU to store Adam's averages in RAM.
-
-Here is the full list of hyper-parameters for this run:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python ./run_bert_squad.py \
-  --bert_model bert-large-uncased \
-  --do_train \
-  --do_predict \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/ \
-  --train_batch_size 24 \
-  --gradient_accumulation_steps 2
-```
-
-If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
-
-Here is an example of hyper-parameters for a FP16 run we tried:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python ./run_bert_squad.py \
-  --bert_model bert-large-uncased \
-  --do_train \
-  --do_predict \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/ \
-  --train_batch_size 24 \
-  --fp16 \
-  --loss_scale 128
-```
-
-The results were similar to the above FP32 results (actually slightly higher):
-
-```bash
-{"exact_match": 84.65468306527909, "f1": 91.238669287002}
-```
-
-Here is an example with the recent `bert-large-uncased-whole-word-masking`:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 \
-  run_bert_squad.py \
-  --bert_model bert-large-uncased-whole-word-masking \
-  --do_train \
-  --do_predict \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/ \
-  --train_batch_size 24 \
-  --gradient_accumulation_steps 2
-```
-
-## Fine-tuning XLNet
-
-### STS-B
-
-This example code fine-tunes XLNet on the STS-B corpus.
-
-Before running this example you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```shell
-export GLUE_DIR=/path/to/glue
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 python ./examples/run_glue.py   --do_train  --task_name=sts-b     --data_dir=${GLUE_DIR}/STS-B   --output_dir=./proc_data/sts-b-110   --max_seq_length=128   --per_gpu_eval_batch_size=8   --per_gpu_train_batch_size=8   --max_steps=1200  --model_name=xlnet-large-cased   --overwrite_output_dir   --overwrite_cache --warmup_steps=120
-```
-
-This hyper-parameters give evaluation results pearsonr > 0.918.
-
-### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node 8 \
- run_xlnet_classifier.py \
- --task_name STS-B \
- --do_train \
- --do_eval \
- --data_dir $GLUE_DIR/STS-B/ \
- --max_seq_length 128 \
- --train_batch_size 8 \
- --gradient_accumulation_steps 1 \
- --learning_rate 5e-5 \
- --num_train_epochs 3.0 \
- --output_dir /tmp/mrpc_output/
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-  acc = 0.8823529411764706
-  acc_and_f1 = 0.901702786377709
-  eval_loss = 0.3418912578906332
-  f1 = 0.9210526315789473
-  global_step = 174
-  loss = 0.07231863956341798
-```
-
-Here is an example on MNLI:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   --bert_model bert-large-uncased-whole-word-masking    --task_name mnli --do_train   --do_eval   --data_dir /datadrive/bert_data/glue_data//MNLI/   --max_seq_length 128   --train_batch_size 8   --learning_rate 2e-5   --num_train_epochs 3.0   --output_dir ../models/wwm-uncased-finetuned-mnli/ --overwrite_output_dir
-```
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
-
-## BERTology
-
-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
-
-- [BERT Rediscovers the Classical NLP Pipeline](https://arxiv.org/abs/1905.05950) by Ian Tenney, Dipanjan Das, Ellie Pavlick
-- [Are Sixteen Heads Really Better than One?](https://arxiv.org/abs/1905.10650) by Paul Michel, Omer Levy, Graham Neubig
-- [What Does BERT Look At? An Analysis of BERT's Attention](https://arxiv.org/abs/1906.04341) by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning
-
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of [Michel et al.](https://arxiv.org/abs/1905.10650):
-
-- accessing all the hidden-states of BERT/GPT/GPT-2,
-- accessing all the attention weights for each head of BERT/GPT/GPT-2,
-- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in [Michel et al.](https://arxiv.org/abs/1905.10650).
-
-To help you understand and use these features, we have added a specific example script: [`bertology.py`](./examples/bertology.py) while extract information and prune a model pre-trained on MRPC.
-
-## Notebooks
-
-We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-transformers/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
-
-- The first NoteBook ([Comparing-TF-and-PT-models.ipynb](./notebooks/Comparing-TF-and-PT-models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
-
-- The second NoteBook ([Comparing-TF-and-PT-models-SQuAD.ipynb](./notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb)) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the `BertForQuestionAnswering` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
-
-- The third NoteBook ([Comparing-TF-and-PT-models-MLM-NSP.ipynb](./notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb)) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
-
-Please follow the instructions given in the notebooks to run and modify them.
-
-## Command-line interface
-
-A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the `BertForPreTraining` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the `OpenAIGPTModel` class  (for OpenAI GPT).
-
-### BERT CLI
-
-You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
-
-This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)).
-
-You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
-
-To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
-
-Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
-
-```shell
-export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-
-pytorch_transformers bert \
-  $BERT_BASE_DIR/bert_model.ckpt \
-  $BERT_BASE_DIR/bert_config.json \
-  $BERT_BASE_DIR/pytorch_model.bin
-```
-
-You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
-
-### OpenAI GPT CLI
-
-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm))
-
-```shell
-export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-
-pytorch_transformers gpt \
-  $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-  $PYTORCH_DUMP_OUTPUT \
-  [OPENAI_GPT_CONFIG]
-```
-
-### Transformer-XL CLI
-
-Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
-
-```shell
-export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-
-pytorch_transformers transfo_xl \
-  $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-  $PYTORCH_DUMP_OUTPUT \
-  [TRANSFO_XL_CONFIG]
-```
-
-### GPT-2
-
-Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model.
-
-```shell
-export GPT2_DIR=/path/to/gpt2/checkpoint
-
-pytorch_transformers gpt2 \
-  $GPT2_DIR/model.ckpt \
-  $PYTORCH_DUMP_OUTPUT \
-  [GPT2_CONFIG]
-```
-
-### XLNet
-
-Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
-
-```shell
-export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-
-pytorch_transformers xlnet \
-  $TRANSFO_XL_CHECKPOINT_PATH \
-  $TRANSFO_XL_CONFIG_PATH \
-  $PYTORCH_DUMP_OUTPUT \
-  STS-B \
-```
-
-## TPU
-
-TPU support and pretraining scripts
-
-TPU are not supported by the current stable release of PyTorch (0.4.1). However, the next version of PyTorch (v1.0) should support training on TPU and is expected to be released soon (see the recent [official announcement](https://cloud.google.com/blog/products/ai-machine-learning/introducing-pytorch-across-google-cloud)).
-
-We will add TPU support when this next release is published.
-
-The original TensorFlow code further comprises two scripts for pre-training BERT: [create_pretraining_data.py](https://github.com/google-research/bert/blob/master/create_pretraining_data.py) and [run_pretraining.py](https://github.com/google-research/bert/blob/master/run_pretraining.py).
-
-Since, pre-training BERT is a particularly expensive operation that basically requires one or several TPUs to be completed in a reasonable amout of time (see details [here](https://github.com/google-research/bert#pre-training-with-bert)) we have decided to wait for the inclusion of TPU support in PyTorch to convert these pre-training scripts.
+At the moment, there is no paper to cite for PyTorch-Transformers but we are working on preparing one.
+In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
new file mode 100644
index 0000000000..fb947ffb51
--- /dev/null
+++ b/docs/source/serialization.rst
@@ -0,0 +1,171 @@
+### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+
+### `from_pretrained()` method
+
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method:
+
+```python
+model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
+```
+
+where
+
+- `BERT_CLASS` is either a tokenizer to load the vocabulary (`BertTokenizer` or `OpenAIGPTTokenizer` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice`, `BertForQuestionAnswering`, `OpenAIGPTModel`, `OpenAIGPTLMHeadModel` or `OpenAIGPTDoubleHeadsModel`, and
+- `PRE_TRAINED_MODEL_NAME_OR_PATH` is either:
+
+  - the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+
+    - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters
+    - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
+    - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_bert_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
+    - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
+    - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
+
+  - a path or url to a pretrained model archive containing:
+
+    - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
+    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
+
+  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
+
+- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
+- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
+- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+
+`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
+
+**When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
+
+Examples:
+
+```python
+# BERT
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+# OpenAI GPT
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+model = OpenAIGPTModel.from_pretrained('openai-gpt')
+
+# Transformer-XL
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+
+# OpenAI GPT-2
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2Model.from_pretrained('gpt2')
+
+```
+
+#### Cache directory
+
+`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
+
+- `cache_dir` optional arguments to the `from_pretrained()` method (see above),
+- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
+- PyTorch cache home + `/pytorch_transformers/`
+  where PyTorch cache home is defined by (in this order):
+  - shell environment variable `ENV_TORCH_HOME`
+  - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
+  - default: `~/.cache/torch/`
+
+Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
+
+You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+
+### Serialization best-practices
+
+This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
+There are three types of files you need to save to be able to reload a fine-tuned model:
+
+- the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
+- the configuration file of the model which is saved as a JSON file, and
+- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+The *default filenames* of these files are as follow:
+
+- the model weights file: `pytorch_model.bin`,
+- the configuration file: `config.json`,
+- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary),
+- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`.
+
+**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.**
+
+Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
+
+```python
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+
+output_dir = "./models/"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+# If we save using the predefined names, we can load using `from_pretrained`
+output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_dir)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# Example for a Bert model
+model = BertForQuestionAnswering.from_pretrained(output_dir)
+tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+# Example for a GPT model
+model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
+```
+
+Here is another way you can save and reload the model if you want to use specific paths for each type of files:
+
+```python
+output_model_file = "./models/my_own_model_file.bin"
+output_config_file = "./models/my_own_config_file.bin"
+output_vocab_file = "./models/my_own_vocab_file.bin"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_vocab_file)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
+# Here is how to do it in this situation:
+
+# Example for a Bert model
+config = BertConfig.from_json_file(output_config_file)
+model = BertForQuestionAnswering(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+# Example for a GPT model
+config = OpenAIGPTConfig.from_json_file(output_config_file)
+model = OpenAIGPTDoubleHeadsModel(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = OpenAIGPTTokenizer(output_vocab_file)
+```
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 2c15aa740b..76a6e00db0 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -614,19 +614,19 @@ class SQuADHead(nn.Module):
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        **start_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
             Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        **start_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
             Indices for the top config.start_n_top start token possibilities (beam-search).
-        **end_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
             Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **end_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
             Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **cls_logits**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size,)``
             Log probabilities for the ``is_impossible`` label of the answers.
     """
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 848e73cfc9..c50d0a3f48 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -1169,19 +1169,19 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
-        **start_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
             Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        **start_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
             Indices for the top config.start_n_top start token possibilities (beam-search).
-        **end_top_log_probs**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
             Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **end_top_index**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
             Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        **cls_logits**: `(`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
             ``torch.FloatTensor`` of shape ``(batch_size,)``
             Log probabilities for the ``is_impossible`` label of the answers.
         **mems**:

From 064d0a0b76f8d8fb90e44960452849720ce38f46 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 00:21:33 +0200
Subject: [PATCH 136/139] update readme

---
 README.md | 45 +++++++++++++++------------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 129179b817..6b24454101 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,23 @@
 # 👾 PyTorch-Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
 
 PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 
 - **[Google's BERT model](https://github.com/google-research/bert)** released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-- **[OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm) released  with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/)** by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+- **[OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm)** released  with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 - **[OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/)** released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 - **[Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 - **[Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 - **[Facebook's XLM model](https://github.com/facebookresearch/XLM/)** released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 
-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet).
-
-You can find more details in the [Examples](#examples) section of the documentation.
-
-## Readme
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](#documentation).
 
 | Section | Description |
 |-|-|
 | [Installation](#installation) | How to install the package |
 | [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning-usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning/usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Documentation](#documentation) | Full API documentation and more |
 
 ## Installation
@@ -44,17 +40,6 @@ Clone the repository and run:
 pip install [--editable] .
 ```
 
-### SpaCy, ftfy
-
-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you can install `ftfy` (version 4.4.3 if you are using Python 2) and `SpaCy` :
-
-```bash
-pip install spacy ftfy==4.4.3
-python -m spacy download en
-```
-
-If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
-
 ### Tests
 
 A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
@@ -72,11 +57,11 @@ python -m pytest -sv ./examples/
 
 Here are two quick-start examples using `Bert` and `GPT2` with pre-trained models.
 
-See the [documentation](#doc) for the details of all the models and classes.
+See the [documentation](#documentation) for the details of all the models and classes.
 
 ### BERT example
 
-First let's prepare a tokenized input with `BertTokenizer`
+First let's prepare a tokenized input from a text string using `BertTokenizer`
 
 ```python
 import torch
@@ -108,7 +93,7 @@ tokens_tensor = torch.tensor([indexed_tokens])
 segments_tensors = torch.tensor([segments_ids])
 ```
 
-Let's see how to use `BertModel` to get encoded inputs:
+Let's see how we can use `BertModel` to encode our inputs in hidden-states:
 
 ```python
 # Load pre-trained model (weights)
@@ -160,9 +145,9 @@ assert predicted_token == 'henson'
 
 ### OpenAI GPT-2
 
-Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model.
+Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
 
-First let's prepare a tokenized input with `GPT2Tokenizer`
+First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
 
 ```python
 import torch
@@ -175,15 +160,15 @@ logging.basicConfig(level=logging.INFO)
 # Load pre-trained model tokenizer (vocabulary)
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
-# Encode some inputs
+# Encode a text inputs
 text = "Who was Jim Henson ? Jim Henson was a"
 indexed_tokens = tokenizer.encode(text)
 
-# Convert inputs to PyTorch tensors
+# Convert indexed tokens in a PyTorch tensor
 tokens_tensor = torch.tensor([indexed_tokens])
 ```
 
-Let's see how to use `GPT2LMHeadModel` to generate some text from our prompt:
+Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
 
 ```python
 # Load pre-trained model (weights)
@@ -208,17 +193,17 @@ predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
 assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
 ```
 
-Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the documentation.
+Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
 
 ## Quick tour: Fine-tuning/usage scripts
 
-We include several example script with SOTA performances for NLU and NLG tasks:
+The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
 
 - fine-tuning Bert/XLNet/XLM with a *sequence-level classifier* on nine different GLUE tasks,
 - fine-tuning Bert/XLNet/XLM with a *token-level classifier* on the question answering dataset SQuAD 2.0, and
 - using GPT/GPT-2/Transformer-XL and XLNet for conditional language generation.
 
-Here are three quick examples:
+Here are three quick usage examples for these scripts:
 
 ### Fine-tuning for sequence classification: GLUE tasks examples
 

From 8ad7e5b4f29d43c6be1363a6b1ebf58133d7c09c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 00:29:15 +0200
Subject: [PATCH 137/139] indeed

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6b24454101..89a0d7a4fe 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,14 @@
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
 
-PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP). The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+
+The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 
 - **[Google's BERT model](https://github.com/google-research/bert)** released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 - **[OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm)** released  with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 - **[OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/)** released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-- **[Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+- **[Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl)** released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 - **[Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/)** released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 - **[Facebook's XLM model](https://github.com/facebookresearch/XLM/)** released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 

From 352e3ff998c7e743439f3d42086ce17edea1abbf Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 09:03:49 +0200
Subject: [PATCH 138/139] added migration guide to readme

---
 README.md                              | 102 ++++++++++++++++++++++++-
 pytorch_transformers/modeling_utils.py |   3 +-
 2 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 89a0d7a4fe..4b56f24920 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,8 @@ These implementations have been tested on several datasets (see the example scri
 |-|-|
 | [Installation](#installation) | How to install the package |
 | [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning/usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
 | [Documentation](#documentation) | Full API documentation and more |
 
 ## Installation
@@ -350,7 +351,102 @@ python ./examples/run_glue.py \
 
 The full documentation is available at https://huggingface.co/pytorch-transformers/.
 
+## Migrating from pytorch-pretrained-bert to pytorch-transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+
+### Models always output `tuples`
+
+The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+
+The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+
+In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
+
+Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+
+```python
+# Let's load our model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+# If you used to have this line in pytorch-pretrained-bert:
+loss = model(input_ids, labels=labels)
+
+# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+outputs = model(input_ids, labels=labels)
+loss = outputs[0]
+
+# In pytorch-transformers you can also have access to the logits:
+loss, logits = outputs[:2]
+
+# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
+outputs = model(input_ids, labels=labels)
+loss, logits, attentions = outputs
+```
+
+### Serialization
+
+While not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other seralization method before.
+
+Here is an example:
+
+```python
+### Let's load a model and tokenizer
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+### Do some stuff to our model and tokenizer
+# Ex: add new tokens to the vocabulary and embeddings of our model
+tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
+model.resize_token_embeddings(len(tokenizer))
+# Train our model
+train(model)
+
+### Now let's save our model and tokenizer to a directory
+model.save_pretrained('./my_saved_model_directory/')
+tokenizer.save_pretrained('./my_saved_model_directory/')
+
+### Reload the model and the tokenizer
+model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
+tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
+```
+
+### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
+
+The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer.
+The new optimizer `AdamW` matches PyTorch `Adam` optimizer API.
+
+The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
+
+Here is a conversion examples from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
+
+```python
+# Parameters:
+lr = 1e-3
+num_total_steps = 1000
+num_warmup_steps = 100
+warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
+
+### Previously BertAdam optimizer was instantiated like this:
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    optimizer.step()
+
+### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
+scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    scheduler.step()
+    optimizer.step()
+```
+
 ## Citation
 
-At the moment, there is no paper to cite for PyTorch-Transformers but we are working on preparing one.
-In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 76a6e00db0..542c70b223 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -259,7 +259,8 @@ class PreTrainedModel(nn.Module):
                 New number of tokens in the embedding matrix.
                 Increasing the size will add newly initialized vectors at the end
                 Reducing the size will remove vectors from the end
-                If not provided or None: does nothing.
+                If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
+
         Return: ``torch.nn.Embeddings``
             Pointer to the input tokens Embedding Module of the model
         """

From 1b35d05d4b3c121a9740544aa6f884f1039780b1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 09:41:55 +0200
Subject: [PATCH 139/139] update conversion scripts and __main__

---
 pytorch_transformers/__main__.py              | 28 ++++++++++++++-----
 .../convert_gpt2_checkpoint_to_pytorch.py     |  5 +++-
 .../convert_openai_checkpoint_to_pytorch.py   |  5 +++-
 .../convert_tf_checkpoint_to_pytorch.py       |  9 +++---
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |  3 ++
 .../convert_xlm_checkpoint_to_pytorch.py      |  3 +-
 .../convert_xlnet_checkpoint_to_pytorch.py    |  9 ++++--
 pytorch_transformers/modeling_xlnet.py        |  2 ++
 .../tokenization_transfo_xl.py                |  2 +-
 pytorch_transformers/tokenization_utils.py    |  3 +-
 pytorch_transformers/tokenization_xlnet.py    |  4 ++-
 11 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/pytorch_transformers/__main__.py b/pytorch_transformers/__main__.py
index 95504c1493..b047fa7447 100644
--- a/pytorch_transformers/__main__.py
+++ b/pytorch_transformers/__main__.py
@@ -1,14 +1,15 @@
 # coding: utf8
 def main():
     import sys
-    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet"]:
+    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
         print(
         "Should be used as one of: \n"
-        ">> `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
-        ">> `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
     else:
         if sys.argv[1] == "bert":
             try:
@@ -86,7 +87,7 @@ def main():
                 else:
                     TF_CONFIG = ""
                 convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        else:
+        elif sys.argv[1] == "xlnet":
             try:
                 from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
             except ImportError:
@@ -104,11 +105,24 @@ def main():
                 PYTORCH_DUMP_OUTPUT = sys.argv[4]
                 if len(sys.argv) == 6:
                     FINETUNING_TASK = sys.argv[5]
+                else:
+                    FINETUNING_TASK = None
 
                 convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
                                                     TF_CONFIG,
                                                     PYTORCH_DUMP_OUTPUT,
                                                     FINETUNING_TASK)
+        elif sys.argv[1] == "xlm":
+            from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+
+            if len(sys.argv) != 4:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+            else:
+                XLM_CHECKPOINT_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+
+                convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index 86c8264cb5..68cb798a7d 100755
--- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -26,6 +26,9 @@ from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
                                                      GPT2Model,
                                                      load_tf_weights_in_gpt2)
 
+import logging
+logging.basicConfig(level=logging.INFO)
+
 
 def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
     # Construct model
@@ -36,7 +39,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
     model = GPT2Model(config)
 
     # Load weights from numpy
-    load_tf_weights_in_gpt2(model, gpt2_checkpoint_path)
+    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
 
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 68e9dea624..8ec852a4bd 100755
--- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -26,6 +26,9 @@ from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                      OpenAIGPTModel,
                                                      load_tf_weights_in_openai_gpt)
 
+import logging
+logging.basicConfig(level=logging.INFO)
+
 
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Construct model
@@ -36,7 +39,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     model = OpenAIGPTModel(config)
 
     # Load weights from numpy
-    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)
+    load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
 
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
index 7530d7e12d..9f121e8b79 100755
--- a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
@@ -18,15 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import re
 import argparse
-import tensorflow as tf
 import torch
-import numpy as np
 
 from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
+import logging
+logging.basicConfig(level=logging.INFO)
+
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
     config = BertConfig.from_json_file(bert_config_file)
@@ -34,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
     model = BertForPreTraining(config)
 
     # Load weights from tf checkpoint
-    load_tf_weights_in_bert(model, tf_checkpoint_path)
+    load_tf_weights_in_bert(model, config, tf_checkpoint_path)
 
     # Save pytorch-model
     print("Save PyTorch model to {}".format(pytorch_dump_path))
diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index db23e5bffe..b6672aedf7 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -36,6 +36,9 @@ if sys.version_info[0] == 2:
 else:
     import pickle
 
+import logging
+logging.basicConfig(level=logging.INFO)
+
 # We do this to be able to load python 2 datasets pickles
 # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
 data_utils.Vocab = data_utils.TransfoXLTokenizer
diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 416f1bc16d..8825f3c0dc 100755
--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -24,9 +24,10 @@ import torch
 import numpy
 
 from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.modeling_xlm import (XLMConfig, XLMModel)
 from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
+import logging
+logging.basicConfig(level=logging.INFO)
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
     # Load checkpoint
diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
index f41db87124..834b47484f 100755
--- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -40,6 +40,8 @@ GLUE_TASKS_NUM_LABELS = {
     "wnli": 2,
 }
 
+import logging
+logging.basicConfig(level=logging.INFO)
 
 def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
     # Initialise PyTorch model
@@ -48,14 +50,17 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py
     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
     if finetuning_task in GLUE_TASKS_NUM_LABELS:
         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
-        model = XLNetForSequenceClassification(config, num_labels=GLUE_TASKS_NUM_LABELS[finetuning_task])
+        config.finetuning_task = finetuning_task
+        config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
+        model = XLNetForSequenceClassification(config)
     elif 'squad' in finetuning_task:
+        config.finetuning_task = finetuning_task
         model = XLNetForQuestionAnswering(config)
     else:
         model = XLNetLMHeadModel(config)
 
     # Load weights from tf checkpoint
-    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path, finetuning_task)
+    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
 
     # Save pytorch-model
     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index c50d0a3f48..855bce7dfe 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -37,9 +37,11 @@ from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTra
 logger = logging.getLogger(__name__)
 
 XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
 }
 XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 }
 
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index b08e8e1cca..9406d48c7b 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -50,7 +50,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'transfo-xl-wt103': 512,
+    'transfo-xl-wt103': None,
 }
 
 PRETRAINED_CORPUS_ARCHIVE_MAP = {
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index d857e6f2d4..df18f5e536 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -208,7 +208,8 @@ class PreTrainedTokenizer(object):
             # if we're using a pretrained model, ensure the tokenizer
             # wont index sequences longer than the number of positional embeddings
             max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+            if max_len is not None and isinstance(max_len, (int, float)):
+                kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
 
         # Merge resolved_vocab_files arguments in kwargs.
         added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index fa60a18d8a..a4f3fdfde2 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -32,12 +32,14 @@ VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model",
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlnet-large-cased': 512,
+    'xlnet-base-cased': None,
+    'xlnet-large-cased': None,
 }
 
 SPIECE_UNDERLINE = u'▁'