Merge branch 'master' into pr/2189

2019-12-17 11:47:32 +01:00
parent f349826a57 f061606277
commit 83bc5235cf
63 changed files with 783 additions and 442 deletions
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
 | Documentation [(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
@@ -446,6 +447,46 @@ python ./examples/run_generation.py \
    --repetition_penalty=1.2 \
 ```
 ## Quick tour of model sharing
 New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
 **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
 ```shell
 transformers-cli login
 # log in using the same credentials as on huggingface.co
 ```
 Upload your model:
 ```shell
 transformers-cli upload ./path/to/pretrained_model/
 # ^^ Upload folder containing weights/tokenizer/config
 # saved via `.save_pretrained()`
 transformers-cli upload ./config.json [--filename folder/foobar.json]
 # ^^ Upload a single file
 # (you can optionally override its filename, which can be nested inside a folder)
 ```
 Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
 ```python
 "username/pretrained_model"
 ```
 Anyone can load it from code:
 ```python
 tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
 model = AutoModel.from_pretrained("username/pretrained_model")
 ```
 Finally, list all your files on S3:
 ```shell
 transformers-cli ls
 # List all your S3 objects.
 ```
 ## Migrating from pytorch-transformers to transformers
 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -58,6 +58,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    installation
    quickstart
    pretrained_models
    model_sharing
    examples
    notebooks
    serialization
--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -0,0 +1,40 @@
 # Model upload and sharing
 Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
 **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
 ```shell
 transformers-cli login
 # log in using the same credentials as on huggingface.co
 ```
 Upload your model:
 ```shell
 transformers-cli upload ./path/to/pretrained_model/
 # ^^ Upload folder containing weights/tokenizer/config
 # saved via `.save_pretrained()`
 transformers-cli upload ./config.json [--filename folder/foobar.json]
 # ^^ Upload a single file
 # (you can optionally override its filename, which can be nested inside a folder)
 ```
 Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
 ```python
 "username/pretrained_model"
 ```
 Anyone can load it from code:
 ```python
 tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
 model = AutoModel.from_pretrained("username/pretrained_model")
 ```
 Finally, list all your files on S3:
 ```shell
 transformers-cli ls
 # List all your S3 objects.
 ```
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -247,7 +247,11 @@ def main():
        out = out[:, len(context_tokens):].tolist()
        for o in out:
            text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
-            text = text[: text.find(args.stop_token) if args.stop_token else None]
+            if args.stop_token:
                index =  text.find(args.stop_token)
                if index == -1:
                    index = None
                text = text[:index]
            print(text)
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -223,7 +223,7 @@ def evaluate(args, model, tokenizer, prefix=""):
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # multi-gpu evaluate
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)
    # Eval!
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -33,6 +33,8 @@ class BertAbsConfig(PretrainedConfig):
    r""" Class to store the configuration of the BertAbs model.
    Arguments:
        vocab_size: int
            Number of tokens in the vocabulary.
        max_pos: int
            The maximum sequence length that this model will be used with.
        enc_layer: int
@@ -65,7 +67,7 @@ class BertAbsConfig(PretrainedConfig):
    def __init__(
        self,
-        vocab_size_or_config_json_file=30522,
+        vocab_size=30522,
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
@@ -81,39 +83,17 @@ class BertAbsConfig(PretrainedConfig):
    ):
        super(BertAbsConfig, self).__init__(**kwargs)
-        if self._input_is_path_to_json(vocab_size_or_config_json_file):
+        self.vocab_size = vocab_size
-            path_to_json = vocab_size_or_config_json_file
+        self.max_pos = max_pos
            with open(path_to_json, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.max_pos = max_pos
-            self.enc_layers = enc_layers
+        self.enc_layers = enc_layers
-            self.enc_hidden_size = enc_hidden_size
+        self.enc_hidden_size = enc_hidden_size
-            self.enc_heads = enc_heads
+        self.enc_heads = enc_heads
-            self.enc_ff_size = enc_ff_size
+        self.enc_ff_size = enc_ff_size
-            self.enc_dropout = enc_dropout
+        self.enc_dropout = enc_dropout
-            self.dec_layers = dec_layers
+        self.dec_layers = dec_layers
-            self.dec_hidden_size = dec_hidden_size
+        self.dec_hidden_size = dec_hidden_size
-            self.dec_heads = dec_heads
+        self.dec_heads = dec_heads
-            self.dec_ff_size = dec_ff_size
+        self.dec_ff_size = dec_ff_size
-            self.dec_dropout = dec_dropout
+        self.dec_dropout = dec_dropout
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    def _input_is_path_to_json(self, first_argument):
        """ Checks whether the first argument passed to config
        is the path to a JSON file that contains the config.
        """
        is_python_2 = sys.version_info[0] == 2
        if is_python_2:
            return isinstance(first_argument, unicode)
        else:
            return isinstance(first_argument, str)
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=50257,
+                 vocab_size=50257,
                 n_positions=1024,
                 n_ctx=1024,
                 n_embd=768,
@@ -75,8 +75,6 @@ class XxxConfig(PretrainedConfig):
                 attn_pdrop=0.1,
                 layer_norm_epsilon=1e-5,
                 initializer_range=0.02,
                 num_labels=1,
                 summary_type='cls_index',
                 summary_use_proj=True,
                 summary_activation=None,
@@ -84,7 +82,7 @@ class XxxConfig(PretrainedConfig):
                 summary_first_dropout=0.1,
                 **kwargs):
        super(XxxConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
@@ -95,23 +93,11 @@ class XxxConfig(PretrainedConfig):
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        if isinstance(vocab_size_or_config_json_file, six.string_types):
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif not isinstance(vocab_size_or_config_json_file, int):
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    @property
    def max_position_embeddings(self):
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/init.py
+++ b/transformers/init.py
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
                         is_tf_available, is_torch_available)
 from .data import (is_sklearn_available,
@@ -33,6 +33,9 @@ from .data import (is_sklearn_available,
 if is_sklearn_available():
    from .data import glue_compute_metrics, xnli_compute_metrics
 # Model Cards
 from .model_card import ModelCard
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
@@ -53,7 +56,7 @@ from .tokenization_xlm_roberta import XLMRobertaTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
-from .configuration_auto import AutoConfig
+from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -72,7 +75,7 @@ from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_
 if is_torch_available():
    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                                AutoModelWithLMHead)
+                                AutoModelWithLMHead, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                BertForMaskedLM, BertForNextSentencePrediction,
@@ -133,7 +136,7 @@ if is_torch_available():
 if is_tf_available():
    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
-                                   TFAutoModelWithLMHead)
+                                   TFAutoModelWithLMHead, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                   TFBertModel, TFBertForPreTraining,
--- a/transformers/configuration_albert.py
+++ b/transformers/configuration_albert.py
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
    pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=30000,
+                 vocab_size=30000,
                 embedding_size=128,
                 hidden_size=4096,
                 num_hidden_layers=12,
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
        """
        super(AlbertConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig):
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
+        self.layer_norm_eps = layer_norm_eps
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
-from .configuration_bert import BertConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_openai import OpenAIGPTConfig
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_gpt2 import GPT2Config
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_transfo_xl import TransfoXLConfig
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlnet import XLNetConfig
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlm import XLMConfig
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_roberta import RobertaConfig
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_distilbert import DistilBertConfig
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRLConfig
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_camembert import CamembertConfig
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_albert import AlbertConfig
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_t5 import T5Config
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 logger = logging.getLogger(__name__)
 ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
    for pretrained_map in [
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ]
    for key, value, in pretrained_map.items())
 class AutoConfig(object):
    r""":class:`~transformers.AutoConfig` is a generic configuration class
        that will be instantiated as one of the configuration classes of the library
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -56,7 +56,7 @@ class BertConfig(PretrainedConfig):
        Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class BertConfig(PretrainedConfig):
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
@@ -95,25 +95,15 @@ class BertConfig(PretrainedConfig):
                 layer_norm_eps=1e-12,
                 **kwargs):
        super(BertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+        self.vocab_size = vocab_size
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+        self.hidden_size = hidden_size
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+        self.num_hidden_layers = num_hidden_layers
-                json_config = json.loads(reader.read())
+        self.num_attention_heads = num_attention_heads
-            for key, value in json_config.items():
+        self.hidden_act = hidden_act
-                self.__dict__[key] = value
+        self.intermediate_size = intermediate_size
-        elif isinstance(vocab_size_or_config_json_file, int):
+        self.hidden_dropout_prob = hidden_dropout_prob
-            self.vocab_size = vocab_size_or_config_json_file
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.hidden_size = hidden_size
+        self.max_position_embeddings = max_position_embeddings
-            self.num_hidden_layers = num_hidden_layers
+        self.type_vocab_size = type_vocab_size
-            self.num_attention_heads = num_attention_heads
+        self.initializer_range = initializer_range
-            self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
--- a/transformers/configuration_ctrl.py
+++ b/transformers/configuration_ctrl.py
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
    """Configuration class to store the configuration of a `CTRLModel`.
    Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel`.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        dff: Size of the inner dimension of the FFN.
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
    def __init__(
        self,
-        vocab_size_or_config_json_file=246534,
+        vocab_size=246534,
        n_positions=256,
        n_ctx=256,
        n_embd=1280,
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        num_labels=1,
        summary_type='cls_index',
        summary_use_proj=True,
        summary_activation=None,
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
        """Constructs CTRLConfig.
        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel`.
            n_positions: Number of positional embeddings.
            n_ctx: Size of the causal mask (usually same as n_positions).
            dff: Size of the inner dimension of the FFN.
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
                initializing all weight matrices.
        """
        super(CTRLConfig, self).__init__(**kwargs)
-
+        self.vocab_size = vocab_size
        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif not isinstance(vocab_size_or_config_json_file, int):
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    @property
    def max_position_embeddings(self):
--- a/transformers/configuration_distilbert.py
+++ b/transformers/configuration_distilbert.py
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                 max_position_embeddings=512,
                 sinusoidal_pos_embds=False,
                 n_layers=6,
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
                 seq_classif_dropout=0.2,
                 **kwargs):
        super(DistilBertConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.sinusoidal_pos_embds = sinusoidal_pos_embds
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation = activation
        self.initializer_range = initializer_range
        self.tie_weights_ = tie_weights_
        self.qa_dropout = qa_dropout
        self.seq_classif_dropout = seq_classif_dropout
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.max_position_embeddings = max_position_embeddings
            self.sinusoidal_pos_embds = sinusoidal_pos_embds
            self.n_layers = n_layers
            self.n_heads = n_heads
            self.dim = dim
            self.hidden_dim = hidden_dim
            self.dropout = dropout
            self.attention_dropout = attention_dropout
            self.activation = activation
            self.initializer_range = initializer_range
            self.tie_weights_ = tie_weights_
            self.qa_dropout = qa_dropout
            self.seq_classif_dropout = seq_classif_dropout
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
    @property
    def hidden_size(self):
        return self.dim
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
    """Configuration class to store the configuration of a `GPT2Model`.
    Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model`.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
    def __init__(
        self,
-        vocab_size_or_config_json_file=50257,
+        vocab_size=50257,
        n_positions=1024,
        n_ctx=1024,
        n_embd=768,
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        num_labels=1,
        summary_type='cls_index',
        summary_use_proj=True,
        summary_activation=None,
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
        """Constructs GPT2Config.
        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model`.
            n_positions: Number of positional embeddings.
            n_ctx: Size of the causal mask (usually same as n_positions).
            n_embd: Dimensionality of the embeddings and hidden states.
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
                initializing all weight matrices.
        """
        super(GPT2Config, self).__init__(**kwargs)
-
+        self.vocab_size = vocab_size
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+        self.n_ctx = n_ctx
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+        self.n_positions = n_positions
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+        self.n_embd = n_embd
-                json_config = json.loads(reader.read())
+        self.n_layer = n_layer
-            for key, value in json_config.items():
+        self.n_head = n_head
-                self.__dict__[key] = value
+        self.resid_pdrop = resid_pdrop
-        elif isinstance(vocab_size_or_config_json_file, int):
+        self.embd_pdrop = embd_pdrop
-            self.vocab_size = vocab_size_or_config_json_file
+        self.attn_pdrop = attn_pdrop
-            self.n_ctx = n_ctx
+        self.layer_norm_epsilon = layer_norm_epsilon
-            self.n_positions = n_positions
+        self.initializer_range = initializer_range
-            self.n_embd = n_embd
+        self.summary_type = summary_type
-            self.n_layer = n_layer
+        self.summary_use_proj = summary_use_proj
-            self.n_head = n_head
+        self.summary_activation = summary_activation
-            self.resid_pdrop = resid_pdrop
+        self.summary_first_dropout = summary_first_dropout
-            self.embd_pdrop = embd_pdrop
+        self.summary_proj_to_labels = summary_proj_to_labels
            self.attn_pdrop = attn_pdrop
            self.layer_norm_epsilon = layer_norm_epsilon
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_first_dropout = summary_first_dropout
            self.summary_proj_to_labels = summary_proj_to_labels
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    @property
    def max_position_embeddings(self):
--- a/transformers/configuration_openai.py
+++ b/transformers/configuration_openai.py
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
    Configuration class to store the configuration of a `OpenAIGPTModel`.
    Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel`.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
    def __init__(
        self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size=40478,
        n_positions=512,
        n_ctx=512,
        n_embd=768,
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        predict_special_tokens=True,
        num_labels=1,
        summary_type='cls_index',
        summary_use_proj=True,
        summary_activation=None,
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
        """Constructs OpenAIGPTConfig.
        """
        super(OpenAIGPTConfig, self).__init__(**kwargs)
-
+        self.vocab_size = vocab_size
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+        self.n_ctx = n_ctx
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+        self.n_positions = n_positions
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+        self.n_embd = n_embd
-                json_config = json.loads(reader.read())
+        self.n_layer = n_layer
-            for key, value in json_config.items():
+        self.n_head = n_head
-                self.__dict__[key] = value
+        self.afn = afn
-        elif isinstance(vocab_size_or_config_json_file, int):
+        self.resid_pdrop = resid_pdrop
-            self.vocab_size = vocab_size_or_config_json_file
+        self.embd_pdrop = embd_pdrop
-            self.n_ctx = n_ctx
+        self.attn_pdrop = attn_pdrop
-            self.n_positions = n_positions
+        self.layer_norm_epsilon = layer_norm_epsilon
-            self.n_embd = n_embd
+        self.initializer_range = initializer_range
-            self.n_layer = n_layer
+        self.predict_special_tokens = predict_special_tokens
-            self.n_head = n_head
+        self.summary_type = summary_type
-            self.afn = afn
+        self.summary_use_proj = summary_use_proj
-            self.resid_pdrop = resid_pdrop
+        self.summary_activation = summary_activation
-            self.embd_pdrop = embd_pdrop
+        self.summary_first_dropout = summary_first_dropout
-            self.attn_pdrop = attn_pdrop
+        self.summary_proj_to_labels = summary_proj_to_labels
            self.layer_norm_epsilon = layer_norm_epsilon
            self.initializer_range = initializer_range
            self.predict_special_tokens = predict_special_tokens
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_first_dropout = summary_first_dropout
            self.summary_proj_to_labels = summary_proj_to_labels
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    @property
    def max_position_embeddings(self):
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -66,7 +66,7 @@ class T5Config(PretrainedConfig):
    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=32128,
+                 vocab_size=32128,
                 n_positions=512,
                 d_model=512,
                 d_kv=64,
@@ -79,7 +79,7 @@ class T5Config(PretrainedConfig):
                 initializer_factor=1.0,
                 **kwargs):
        super(T5Config, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.d_model = d_model
        self.d_kv = d_kv
@@ -91,17 +91,6 @@ class T5Config(PretrainedConfig):
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        if isinstance(vocab_size_or_config_json_file, six.string_types):
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif not isinstance(vocab_size_or_config_json_file, int):
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )
    @property
    def max_position_embeddings(self):
        return self.n_positions
--- a/transformers/configuration_transfo_xl.py
+++ b/transformers/configuration_transfo_xl.py
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
    """Configuration class to store the configuration of a `TransfoXLModel`.
        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel`.
            cutoffs: cutoffs for the adaptive softmax
            d_model: Dimensionality of the model's hidden states.
            d_embed: Dimensionality of the embeddings
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=267735,
+                 vocab_size=267735,
                 cutoffs=[20000, 40000, 200000],
                 d_model=1024,
                 d_embed=1024,
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
        """Constructs TransfoXLConfig.
        """
        super(TransfoXLConfig, self).__init__(**kwargs)
-        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        self.tie_weight = tie_weight
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif not isinstance(vocab_size_or_config_json_file, int):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
    @property
    def max_position_embeddings(self):
        return self.tgt_len + self.ext_len + self.mem_len
    @property
-    def vocab_size(self):
+    def n_token(self):  # Backward compatibility
-        return self.n_token
+        return self.vocab_size
-    @vocab_size.setter
+    @n_token.setter
-    def vocab_size(self, value):
+    def n_token(self, value):  # Backward compatibility
-        self.n_token = value
+        self.vocab_size = value
    @property
    def hidden_size(self):
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
    pretrained_config_archive_map = {}
    def __init__(self, **kwargs):
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        # Attributes with defaults
        self.num_labels = kwargs.pop('num_labels', 2)
        self.output_attentions = kwargs.pop('output_attentions', False)
        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
        self.output_past = kwargs.pop('output_past', True)  # Not used by all models
@@ -59,6 +58,22 @@ class PretrainedConfig(object):
        self.pruned_heads = kwargs.pop('pruned_heads', {})
        self.is_decoder = kwargs.pop('is_decoder', False)
        # Fine-tuning task arguments
        self.finetuning_task = kwargs.pop('finetuning_task', None)
        self.num_labels = kwargs.pop('num_labels', 2)
        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
        # Additional attributes without default values
        for key, value in kwargs.items():
            try:
                setattr(self, key, value)
            except AttributeError as err:
                logger.error("Can't set {} with value {} for {}".format(key, value, self))
                raise err
    def save_pretrained(self, save_directory):
        """ Save a configuration object to the directory `save_directory`, so that it
            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
@@ -136,10 +151,14 @@ class PretrainedConfig(object):
            config_file = pretrained_model_name_or_path
        else:
            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
-        # redirect to the cache, if necessary
+
        try:
            # Load from URL or cache if already cached
            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
                                               proxies=proxies, resume_download=resume_download)
            # Load config
            config = cls.from_json_file(resolved_config_file)
        except EnvironmentError:
            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
@@ -153,15 +172,18 @@ class PretrainedConfig(object):
                        config_file, CONFIG_NAME)
            raise EnvironmentError(msg)
        except json.JSONDecodeError:
            msg = "Couldn't reach server at '{}' to download configuration file or " \
                  "configuration file is not a valid JSON file. " \
                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
            raise EnvironmentError(msg)
        if resolved_config_file == config_file:
            logger.info("loading configuration file {}".format(config_file))
        else:
            logger.info("loading configuration file {} from cache at {}".format(
                config_file, resolved_config_file))
        # Load config
        config = cls.from_json_file(resolved_config_file)
        if hasattr(config, 'pruned_heads'):
            config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
@@ -183,17 +205,15 @@ class PretrainedConfig(object):
    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `Config` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
+        return cls(**json_object)
        for key, value in json_object.items():
            setattr(config, key, value)
        return config
    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `Config` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
-        return cls.from_dict(json.loads(text))
+        dict_obj = json.loads(text)
        return cls(**dict_obj)
    def __eq__(self, other):
        return self.__dict__ == other.__dict__
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
    """Configuration class to store the configuration of a `XLMModel`.
    Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
        d_model: Size of the encoder layers and the pooler layer.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=30145,
+                 vocab_size=30145,
                 emb_dim=2048,
                 n_layers=12,
                 n_heads=16,
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
                 unk_index=3,
                 mask_index=5,
                 is_encoder=True,
                 finetuning_task=None,
                 num_labels=2,
                 summary_type='first',
                 summary_use_proj=True,
                 summary_activation=None,
@@ -117,56 +114,43 @@ class XLMConfig(PretrainedConfig):
        """Constructs XLMConfig.
        """
        super(XLMConfig, self).__init__(**kwargs)
-
+        self.vocab_size = vocab_size
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+        self.emb_dim = emb_dim
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+        self.n_layers = n_layers
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+        self.n_heads = n_heads
-                json_config = json.loads(reader.read())
+        self.dropout = dropout
-            for key, value in json_config.items():
+        self.attention_dropout = attention_dropout
-                self.__dict__[key] = value
+        self.gelu_activation = gelu_activation
-        elif isinstance(vocab_size_or_config_json_file, int):
+        self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.n_words = vocab_size_or_config_json_file
+        self.causal = causal
-            self.emb_dim = emb_dim
+        self.asm = asm
-            self.n_layers = n_layers
+        self.n_langs = n_langs
-            self.n_heads = n_heads
+        self.use_lang_emb = use_lang_emb
-            self.dropout = dropout
+        self.layer_norm_eps = layer_norm_eps
-            self.attention_dropout = attention_dropout
+        self.bos_index = bos_index
-            self.gelu_activation = gelu_activation
+        self.eos_index = eos_index
-            self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.pad_index = pad_index
-            self.causal = causal
+        self.unk_index = unk_index
-            self.asm = asm
+        self.mask_index = mask_index
-            self.n_langs = n_langs
+        self.is_encoder = is_encoder
-            self.use_lang_emb = use_lang_emb
+        self.max_position_embeddings = max_position_embeddings
-            self.layer_norm_eps = layer_norm_eps
+        self.embed_init_std = embed_init_std
-            self.bos_index = bos_index
+        self.init_std = init_std
-            self.eos_index = eos_index
+        self.summary_type = summary_type
-            self.pad_index = pad_index
+        self.summary_use_proj = summary_use_proj
-            self.unk_index = unk_index
+        self.summary_activation = summary_activation
-            self.mask_index = mask_index
+        self.summary_proj_to_labels = summary_proj_to_labels
-            self.is_encoder = is_encoder
+        self.summary_first_dropout = summary_first_dropout
-            self.max_position_embeddings = max_position_embeddings
+        self.start_n_top = start_n_top
-            self.embed_init_std = embed_init_std
+        self.end_n_top = end_n_top
            self.init_std = init_std
            self.finetuning_task = finetuning_task
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_proj_to_labels = summary_proj_to_labels
            self.summary_first_dropout = summary_first_dropout
            self.start_n_top = start_n_top
            self.end_n_top = end_n_top
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
    @property
-    def vocab_size(self):
+    def n_words(self):  # For backward compatibility
-        return self.n_words
+        return self.vocab_size
-    @vocab_size.setter
+    @n_words.setter
-    def vocab_size(self, value):
+    def n_words(self, value):  # For backward compatibility
-        self.n_words = value
+        self.vocab_size = value
    @property
    def hidden_size(self):
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
    """Configuration class to store the configuration of a ``XLNetModel``.
    Args:
-        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
        d_model: Size of the encoder layers and the pooler layer.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
    def __init__(self,
-                 vocab_size_or_config_json_file=32000,
+                 vocab_size=32000,
                 d_model=1024,
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 max_position_embeddings=512,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 dropout=0.1,
                 mem_len=None,
                 reuse_len=None,
                 bi_data=False,
                 clamp_len=-1,
                 same_length=False,
                 finetuning_task=None,
                 num_labels=2,
                 summary_type='last',
                 summary_use_proj=True,
                 summary_activation='tanh',
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
        """Constructs XLNetConfig.
        """
        super(XLNetConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layer = n_layer
        self.n_head = n_head
        assert d_model % n_head == 0
        self.d_head = d_model // n_head
        self.ff_activation = ff_activation
        self.d_inner = d_inner
        self.untie_r = untie_r
        self.attn_type = attn_type
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+        self.initializer_range = initializer_range
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+        self.layer_norm_eps = layer_norm_eps
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                setattr(config, key, value)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.d_model = d_model
            self.n_layer = n_layer
            self.n_head = n_head
            assert d_model % n_head == 0
            self.d_head = d_model // n_head
            self.ff_activation = ff_activation
            self.d_inner = d_inner
            self.untie_r = untie_r
            self.attn_type = attn_type
-            self.initializer_range = initializer_range
+        self.dropout = dropout
-            self.layer_norm_eps = layer_norm_eps
+        self.mem_len = mem_len
        self.reuse_len = reuse_len
        self.bi_data = bi_data
        self.clamp_len = clamp_len
        self.same_length = same_length
-            self.dropout = dropout
+        self.summary_type = summary_type
-            self.mem_len = mem_len
+        self.summary_use_proj = summary_use_proj
-            self.reuse_len = reuse_len
+        self.summary_activation = summary_activation
-            self.bi_data = bi_data
+        self.summary_last_dropout = summary_last_dropout
-            self.clamp_len = clamp_len
+        self.start_n_top = start_n_top
-            self.same_length = same_length
+        self.end_n_top = end_n_top
            self.finetuning_task = finetuning_task
            self.num_labels = num_labels
            self.summary_type = summary_type
            self.summary_use_proj = summary_use_proj
            self.summary_activation = summary_activation
            self.summary_last_dropout = summary_last_dropout
            self.start_n_top = start_n_top
            self.end_n_top = end_n_top
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
    @property
    def max_position_embeddings(self):
        return -1
    @property
-    def vocab_size(self):
+    def n_token(self):  # Backward compatibility
-        return self.n_token
+        return self.vocab_size
-    @vocab_size.setter
+    @n_token.setter
-    def vocab_size(self, value):
+    def n_token(self, value):  # Backward compatibility
-        self.n_token = value
+        self.vocab_size = value
    @property
    def hidden_size(self):
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -46,7 +46,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    config = BertConfig(
-        vocab_size_or_config_json_file=50265,
+        vocab_size=50265,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -72,7 +72,7 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-
+MODEL_CARD_NAME = "model_card.json"
 DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
 DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
--- a/transformers/model_card.py
+++ b/transformers/model_card.py
@@ -0,0 +1,226 @@
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Model card class and utilities."""
 from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
 import copy
 import json
 import logging
 import os
 from io import open
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
 logger = logging.getLogger(__name__)
 class ModelCard(object):
    r""" Model Card class.
        Store model card as well as methods for loading/downloading/saving model cards.
        Please read the following paper for details and explanation on the sections:
            "Model Cards for Model Reporting"
                by Margaret Mitchell, Simone Wu,
                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
            Link: https://arxiv.org/abs/1810.03993
        Note:
            A model card can be loaded and saved to disk.
        Parameters:
            kwargs: key/value pairs stored as attributes of the model card.
                - The recommended sections (``model_details``, ``intended_use``, ``factors``, ``metrics``,
                  ``evaluation_data``, ``training_data``, ``quantitative_analyses``, ``ethical_considerations``,
                  ``caveats_and_recommendations``) each default to an empty dict when not provided.
                - Any other key is set as an additional attribute on the instance.
    """
    def __init__(self, **kwargs):
        # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
        self.model_details = kwargs.pop('model_details', {})
        self.intended_use = kwargs.pop('intended_use', {})
        self.factors = kwargs.pop('factors', {})
        self.metrics = kwargs.pop('metrics', {})
        self.evaluation_data = kwargs.pop('evaluation_data', {})
        self.training_data = kwargs.pop('training_data', {})
        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
        # Open additional attributes: everything left in kwargs becomes an attribute
        for key, value in kwargs.items():
            try:
                setattr(self, key, value)
            except AttributeError as err:
                # Log which key could not be set before propagating the error
                logger.error("Can't set {} with value {} for {}".format(key, value, self))
                raise err
    def save_pretrained(self, save_directory_or_file):
        """ Save a model card object to the directory or file `save_directory_or_file`.

        If `save_directory_or_file` is an existing directory, the card is written inside it
        under the predefined model card file name; otherwise the argument is used as the
        output file path.
        """
        if os.path.isdir(save_directory_or_file):
            # If we save using the predefined names, we can load using `from_pretrained`
            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
        else:
            output_model_card_file = save_directory_or_file
        self.to_json_file(output_model_card_file)
        logger.info("Model card saved in {}".format(output_model_card_file))
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
        Parameters:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                card should be cached if the standard cache should not be used.
            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model card file and override the cached version if it exists.
            resume_download: (`optional`) boolean, default False:
                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            return_unused_kwargs: (`optional`) bool:
                - If False, then this function returns just the final model card object.
                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
        Returns:
            The loaded model card, or the tuple `(model card, unused_kwargs)` when `return_unused_kwargs` is True.
            If the file cannot be fetched or is not valid JSON, a warning is logged and an empty model card is used instead.
        Examples::
            model_card = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
            model_card = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
            model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json')
            model_card = ModelCard.from_pretrained('bert-base-uncased', metrics={'accuracy': 0.9}, foo=False)
        """
        cache_dir = kwargs.pop('cache_dir', None)
        force_download = kwargs.pop('force_download', False)
        resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)
        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json)
            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
        elif os.path.isdir(pretrained_model_name_or_path):
            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            model_card_file = pretrained_model_name_or_path
        else:
            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
        try:
            # Load from URL or cache if already cached
            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
                                               proxies=proxies, resume_download=resume_download)
            if resolved_model_card_file == model_card_file:
                logger.info("loading model card file {}".format(model_card_file))
            else:
                logger.info("loading model card file {} from cache at {}".format(
                    model_card_file, resolved_model_card_file))
            # Load model card
            model_card = cls.from_json_file(resolved_model_card_file)
        except EnvironmentError:
            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
                        model_card_file))
            else:
                logger.warning("Model name '{}' was not found in model name list ({}). " \
                      "We assumed '{}' was a path or url to a model card file named {} or " \
                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
                        model_card_file, MODEL_CARD_NAME))
            logger.warning("Creating an empty model card.")
            # We fall back on creating an empty model card
            model_card = cls()
        except json.JSONDecodeError:
            # Raised while parsing, so `resolved_model_card_file` is already bound here
            logger.warning("Couldn't reach server at '{}' to download model card file or "
                           "model card file is not a valid JSON file. "
                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
            logger.warning("Creating an empty model card.")
            # We fall back on creating an empty model card
            model_card = cls()
        # Update model card with kwargs if needed
        # Keys are collected first so that kwargs is not modified while being iterated
        to_remove = []
        for key, value in kwargs.items():
            if hasattr(model_card, key):
                setattr(model_card, key, value)
                to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)
        logger.info("Model card: %s", str(model_card))
        if return_unused_kwargs:
            return model_card, kwargs
        else:
            return model_card
    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `ModelCard` from a Python dictionary of parameters."""
        return cls(**json_object)
    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `ModelCard` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        dict_obj = json.loads(text)
        return cls(**dict_obj)
    def __eq__(self, other):
        """Compares two model cards attribute by attribute.

        Returns `NotImplemented` when `other` exposes no `__dict__` (e.g. an int or None),
        so that such a comparison evaluates to False instead of raising AttributeError.
        """
        if not hasattr(other, '__dict__'):
            return NotImplemented
        return self.__dict__ == other.__dict__
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        # Deep copy so that changes to the returned dict do not affect the model card
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
    def to_json_file(self, json_file_path):
        """ Save this instance to a json file."""
        with open(json_file_path, "w", encoding='utf-8') as writer:
            writer.write(self.to_json_string())
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -18,18 +18,18 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
+from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
+from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_t5 import T5Model, T5WithLMHeadModel
+from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_utils import PreTrainedModel, SequenceSummary
@@ -38,6 +38,24 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 # Merge every per-architecture archive map into a single lookup table.
 # When the same key appears in several maps, the map listed later wins.
 ALL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    map_key: map_value
    for archive_map in (
        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
    for map_key, map_value in archive_map.items()
 }
 class AutoModel(object):
    r"""
        :class:`~transformers.AutoModel` is a generic model class
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    """
    def __init__(self, config):
        super(GPT2DoubleHeadsModel, self).__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
        config.num_labels = 1
        self.transformer = OpenAIGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
-from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
+from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
+from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
+from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
+from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
+from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
+from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
+from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
+from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
+from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 # Merge every per-architecture TF archive map into a single lookup table.
 # When the same key appears in several maps, the map listed later wins.
 TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    map_key: map_value
    for archive_map in (
        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
    for map_key, map_value in archive_map.items()
 }
 class TFAutoModel(object):
    r"""
        :class:`~transformers.TFAutoModel` is a generic model class
@@ -144,6 +162,8 @@ class TFAutoModel(object):
            return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -280,6 +300,8 @@ class TFAutoModelWithLMHead(object):
            return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
@@ -407,6 +429,8 @@ class TFAutoModelForSequenceClassification(object):
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -143,7 +143,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
        name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
        # Find associated numpy array in pytorch model state dict
-        assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
+        if name not in pt_state_dict:
            if allow_missing_keys:
                continue
            raise AttributeError("{} not found in PyTorch model".format(name))
        array = pt_state_dict[name].numpy()
        if transpose:
@@ -250,6 +254,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
    all_tf_weights = set(list(tf_weights_map.keys()))
    loaded_pt_weights_data_ptr = {}
    missing_keys_pt = []
    for pt_weight_name, pt_weight in current_pt_params_dict.items():
        # Handle PyTorch shared weights (not duplicated in TF 2.0)
        if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
@@ -258,7 +263,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
        # Find associated numpy array in pytorch model state dict
        if pt_weight_name not in tf_weights_map:
-            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
+            if allow_missing_keys:
                missing_keys_pt.append(pt_weight_name)
                continue
            raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))
        array, transpose = tf_weights_map[pt_weight_name]
@@ -283,6 +291,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
        all_tf_weights.discard(pt_weight_name)
    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
    missing_keys += missing_keys_pt
    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
        self.d_embed = config.d_embed
        self.d_model = config.d_model
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        self.d_head = config.d_head
        self.untie_r = config.untie_r
-        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, 
+        self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, 
                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')
        self.drop = tf.keras.layers.Dropout(config.dropout)
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            raise NotImplementedError
        # use adaptive softmax (including standard softmax)
        else:
-            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, 
+            self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, 
                                              config.cutoffs, div_val=config.div_val, name='crit')
    def reset_length(self, tgt_len, ext_len, mem_len):
--- a/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/transformers/modeling_tf_transfo_xl_utilities.py
@@ -25,15 +25,15 @@ import tensorflow as tf
 from .modeling_tf_utils import shape_list
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
                 keep_order=False, **kwargs):
        super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
-        self.n_token = n_token
+        self.vocab_size = vocab_size
        self.d_embed = d_embed
        self.d_proj = d_proj
-        self.cutoffs = cutoffs + [n_token]
+        self.cutoffs = cutoffs + [vocab_size]
        self.cutoff_ends = [0] + self.cutoffs
        self.div_val = div_val
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    self.out_projs.append(weight)
                else:
                    self.out_projs.append(None)
-                weight = self.add_weight(shape=(self.n_token, self.d_embed,),
+                weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
                                         initializer='zeros',
                                         trainable=True,
                                         name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(self.n_token,),
+                bias = self.add_weight(shape=(self.vocab_size,),
                                         initializer='zeros',
                                         trainable=True,
                                         name='out_layers_._{}_._bias'.format(i))
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
        hidden, target = inputs
        head_logprob = 0
        if self.n_clusters == 0:
-            softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
+            softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
            output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
            if target is not None:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -297,7 +297,7 @@ class TFPreTrainedModel(tf.keras.Model):
        if from_pt:
            # Load from a PyTorch checkpoint
-            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
        ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.use_bfloat16 = config.use_bfloat16
        self.initializer_range = config.initializer_range
-        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
+        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)
--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head
-        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
                                          div_val=config.div_val)
        self.drop = nn.Dropout(config.dropout)
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
        if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
+            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
        # use adaptive softmax (including standard softmax)
        else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
+            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
                                                    config.cutoffs, div_val=config.div_val)
        self.init_weights()
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
        self.clamp_len = config.clamp_len
        self.n_layer = config.n_layer
-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
        self.dropout = nn.Dropout(config.dropout)
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        self.same_length = config.same_length
        self.transformer = XLNetModel(config)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
        self.init_weights()
--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -16,15 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import copy
 import os
 import shutil
 import json
-import random
+import tempfile
 import uuid
 import unittest
-import logging
+from .tokenization_tests_commons import TemporaryDirectory
 class ConfigTester(object):
@@ -48,16 +45,28 @@ class ConfigTester(object):
    def create_and_test_config_to_json_file(self):
        config_first = self.config_class(**self.inputs_dict)
-        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
+
-        config_first.to_json_file(json_file_path)
+        with TemporaryDirectory() as tmpdirname:
-        config_second = self.config_class.from_json_file(json_file_path)
+            json_file_path = os.path.join(tmpdirname, "config.json")
-        os.remove(json_file_path)
+            config_first.to_json_file(json_file_path)
            config_second = self.config_class.from_json_file(json_file_path)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
    def create_and_test_config_from_and_save_pretrained(self):
        """Check that a config survives a save_pretrained/from_pretrained round trip.

        A config built from `self.inputs_dict` is saved into a temporary
        directory, loaded back from it, and the two `to_dict()` results are
        compared through `self.parent.assertEqual`.
        """
        original = self.config_class(**self.inputs_dict)
        with TemporaryDirectory() as scratch_dir:
            original.save_pretrained(scratch_dir)
            reloaded = self.config_class.from_pretrained(scratch_dir)
        self.parent.assertEqual(reloaded.to_dict(), original.to_dict())
    def run_common_tests(self):
        """Run every shared configuration check, one after another, in a fixed order."""
        for check_name in (
            "create_and_test_config_common_properties",
            "create_and_test_config_to_json_string",
            "create_and_test_config_to_json_file",
            "create_and_test_config_from_and_save_pretrained",
        ):
            # Look each check up right before calling it, as separate calls would.
            getattr(self, check_name)()
 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
@@ -0,0 +1,89 @@
 # coding=utf-8
 # Copyright 2019 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import json
 import unittest
 from transformers.model_card import ModelCard
 from .tokenization_tests_commons import TemporaryDirectory
 class ModelCardTester(unittest.TestCase):
    """Attribute and serialization round-trip checks for `ModelCard`."""
    def setUp(self):
        """Prepare the sample fields every test builds its model card from."""
        model_details = {
            'Organization': 'testing',
            'Model date': 'today',
            'Model version': 'v2.1, Developed by Test Corp in 2019.',
            'Architecture': 'Convolutional Neural Network.',
        }
        evaluation_data = {
            'Datasets': {
                'BLEU': 'My-great-dataset-v1',
                'ROUGE-1': 'My-short-dataset-v2.1',
            },
            'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf',
        }
        training_data = {
            'Dataset': 'English Wikipedia dump dated 2018-12-01',
            'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf',
        }
        quantitative_analyses = {
            'BLEU': 55.1,
            'ROUGE-1': 76,
        }
        self.inputs_dict = {
            'model_details': model_details,
            'metrics': 'BLEU and ROUGE-1',
            'evaluation_data': evaluation_data,
            'training_data': training_data,
            'quantitative_analyses': quantitative_analyses,
        }
    def test_model_card_common_properties(self):
        """A card built from the sample dict exposes every expected section."""
        card = ModelCard.from_dict(self.inputs_dict)
        expected_sections = (
            'model_details',
            'intended_use',
            'factors',
            'metrics',
            'evaluation_data',
            'training_data',
            'quantitative_analyses',
            'ethical_considerations',
            'caveats_and_recommendations',
        )
        for section in expected_sections:
            self.assertTrue(hasattr(card, section))
    def test_model_card_to_json_string(self):
        """The JSON string decodes back to the values the card was built from."""
        card = ModelCard.from_dict(self.inputs_dict)
        decoded = json.loads(card.to_json_string())
        for field_name, expected in self.inputs_dict.items():
            self.assertEqual(decoded[field_name], expected)
    def test_model_card_to_json_file(self):
        """Writing a card to a JSON file and reading it back preserves its content."""
        written = ModelCard.from_dict(self.inputs_dict)
        with TemporaryDirectory() as scratch_dir:
            card_path = os.path.join(scratch_dir, u"model_card.json")
            written.to_json_file(card_path)
            reloaded = ModelCard.from_json_file(card_path)
        self.assertEqual(reloaded.to_dict(), written.to_dict())
    def test_model_card_from_and_save_pretrained(self):
        """A save_pretrained/from_pretrained round trip preserves the card content."""
        written = ModelCard.from_dict(self.inputs_dict)
        with TemporaryDirectory() as scratch_dir:
            written.save_pretrained(scratch_dir)
            reloaded = ModelCard.from_pretrained(scratch_dir)
        self.assertEqual(reloaded.to_dict(), written.to_dict())
 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -676,7 +676,7 @@ class CommonTestCases:
                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
            config = self.config_class(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -93,7 +93,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
            config = T5Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -87,7 +87,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            config = T5Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
            config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                     num_attention_heads=4,
                     d_inner=128,
                     num_hidden_layers=5,
                     max_position_embeddings=10,
                     type_sequence_label_size=2,
                     untie_r=True,
                     bi_data=False,
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
            self.num_attention_heads = num_attention_heads
            self.d_inner = d_inner
            self.num_hidden_layers = num_hidden_layers
            self.max_position_embeddings = max_position_embeddings
            self.bi_data = bi_data
            self.untie_r = untie_r
            self.same_length = same_length
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
            config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                d_model=self.hidden_size,
                n_head=self.num_attention_heads,
                d_inner=self.d_inner,
                n_layer=self.num_hidden_layers,
                untie_r=self.untie_r,
                max_position_embeddings=self.max_position_embeddings,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                same_length=self.same_length,
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                 n_special=self.n_special,
                 emb_dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                     num_attention_heads=4,
                     d_inner=128,
                     num_hidden_layers=5,
                     max_position_embeddings=10,
                     type_sequence_label_size=2,
                     untie_r=True,
                     bi_data=False,
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
            self.num_attention_heads = num_attention_heads
            self.d_inner = d_inner
            self.num_hidden_layers = num_hidden_layers
            self.max_position_embeddings = max_position_embeddings
            self.bi_data = bi_data
            self.untie_r = untie_r
            self.same_length = same_length
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
            config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                d_model=self.hidden_size,
                n_head=self.num_attention_heads,
                d_inner=self.d_inner,
                n_layer=self.num_hidden_layers,
                untie_r=self.untie_r,
                max_position_embeddings=self.max_position_embeddings,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                same_length=self.same_length,