From 256086bc6908448fc6aff9b1e19d95c4f6019bee Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 30 Aug 2019 22:34:23 +0200
Subject: [PATCH] clean up and simplify hubconf

---
 hubconf.py                            | 118 ++++++++-
 hubconfs/automodels_hubconf.py        | 110 --------
 hubconfs/bert_hubconf.py              | 360 --------------------------
 hubconfs/gpt2_hubconf.py              | 168 ------------
 hubconfs/gpt_hubconf.py               | 186 -------------
 hubconfs/transformer_xl_hubconf.py    | 130 ----------
 hubconfs/xlm_hubconf.py               | 167 ------------
 hubconfs/xlnet_hubconf.1.py           | 169 ------------
 pytorch_transformers/modeling_auto.py |   5 -
 9 files changed, 110 insertions(+), 1303 deletions(-)
 delete mode 100644 hubconfs/automodels_hubconf.py
 delete mode 100644 hubconfs/bert_hubconf.py
 delete mode 100644 hubconfs/gpt2_hubconf.py
 delete mode 100644 hubconfs/gpt_hubconf.py
 delete mode 100644 hubconfs/transformer_xl_hubconf.py
 delete mode 100644 hubconfs/xlm_hubconf.py
 delete mode 100644 hubconfs/xlnet_hubconf.1.py

diff --git a/hubconf.py b/hubconf.py
index 05afd63a46..35e7f1eea8 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,10 +1,112 @@
+from pytorch_transformers import (
+    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
+)
+from pytorch_transformers.modeling_utils import add_start_docstrings
+
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
 
-from hubconfs.automodels_hubconf import (
-    config,
-    model,
-    modelForQuestionAnswering,
-    modelForSequenceClassification,
-    modelWithLMHead,
-    tokenizer,
-)
+@add_start_docstrings(AutoConfig.__doc__)
+def config(*args, **kwargs):
+    r"""
+                # Using torch.hub !
+                import torch
+
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/bert_saved_model/')`
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False)
+                assert config.output_attentions == True
+                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+                assert config.output_attentions == True
+                assert unused_kwargs == {'foo': False}
+
+            """
+    # torch.hub entry point: forward every argument unchanged to AutoConfig.from_pretrained.
+    return AutoConfig.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoTokenizer.__doc__)
+def tokenizer(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`
+
+    """
+    # torch.hub entry point: forward every argument unchanged to AutoTokenizer.from_pretrained.
+    return AutoTokenizer.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModel.__doc__)
+def model(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = torch.hub.load('huggingface/pytorch-transformers', 'config', './tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+    # torch.hub entry point: forward every argument unchanged to AutoModel.from_pretrained.
+    return AutoModel.from_pretrained(*args, **kwargs)
+
+@add_start_docstrings(AutoModelWithLMHead.__doc__)
+def modelWithLMHead(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        assert model.config.output_attentions == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = torch.hub.load('huggingface/pytorch-transformers', 'config', './tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+    """
+    # torch.hub entry point: forward every argument unchanged to AutoModelWithLMHead.from_pretrained.
+    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
+def modelForSequenceClassification(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = torch.hub.load('huggingface/pytorch-transformers', 'config', './tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+    # torch.hub entry point: forward every argument unchanged to AutoModelForSequenceClassification.from_pretrained.
+    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
+def modelForQuestionAnswering(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        assert model.config.output_attentions == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = torch.hub.load('huggingface/pytorch-transformers', 'config', './tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+    """
+    # torch.hub entry point: forward every argument unchanged to AutoModelForQuestionAnswering.from_pretrained.
+    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
diff --git a/hubconfs/automodels_hubconf.py b/hubconfs/automodels_hubconf.py
deleted file mode 100644
index 5c1ab5ebc6..0000000000
--- a/hubconfs/automodels_hubconf.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from pytorch_transformers import (
-    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
-)
-from pytorch_transformers.modeling_utils import add_start_docstrings
-
-@add_start_docstrings(AutoConfig.__doc__)
-def config(*args, **kwargs):
-    r""" 
-                # Using torch.hub !
-                import torch
-
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
-                assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
-                assert config.output_attention == True
-                assert unused_kwargs == {'foo': False}
-
-            """
-
-    return AutoConfig.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoTokenizer.__doc__)
-def tokenizer(*args, **kwargs):
-    r""" 
-        # Using torch.hub !
-        import torch
-
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
-
-    """
-
-    return AutoTokenizer.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModel.__doc__)
-def model(*args, **kwargs):
-    r"""
-            # Using torch.hub !
-            import torch
-
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-
-    return AutoModel.from_pretrained(*args, **kwargs)
-
-@add_start_docstrings(AutoModelWithLMHead.__doc__)
-def modelWithLMHead(*args, **kwargs):
-    r"""
-        # Using torch.hub !
-        import torch
-
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-        assert model.config.output_attention == True
-        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-    """
-    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
-def modelForSequenceClassification(*args, **kwargs):
-    r"""
-            # Using torch.hub !
-            import torch
-
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-
-    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
-
-
-@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
-def modelForQuestionAnswering(*args, **kwargs):
-    r"""
-        # Using torch.hub !
-        import torch
-
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
-        assert model.config.output_attention == True
-        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-    """
-    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
deleted file mode 100644
index 6e2830617f..0000000000
--- a/hubconfs/bert_hubconf.py
+++ /dev/null
@@ -1,360 +0,0 @@
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.modeling_bert import (
-        BertModel,
-        BertForNextSentencePrediction,
-        BertForMaskedLM,
-        BertForMultipleChoice,
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BertForTokenClassification,
-        )
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-bert_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load
-                . `bert-base-uncased`
-                . `bert-large-uncased`
-                . `bert-base-cased`
-                . `bert-large-cased`
-                . `bert-base-multilingual-uncased`
-                . `bert-base-multilingual-cased`
-                . `bert-base-chinese`
-                . `bert-base-german-cased`
-                . `bert-large-uncased-whole-word-masking`
-                . `bert-large-cased-whole-word-masking`
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
-                  instance
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow
-                 checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models
-                   will be cached.
-        state_dict: an optional state dictionary
-                    (collections.OrderedDict object) to use instead of Google
-                    pre-trained models
-        *inputs, **kwargs: additional input for the specific Bert class
-            (ex: num_labels for BertForSequenceClassification)
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def bertTokenizer(*args, **kwargs):
-    """
-    Instantiate a BertTokenizer from a pre-trained/customized vocab file
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * bert-base-uncased
-                                       * bert-large-uncased
-                                       * bert-base-cased
-                                       * bert-large-cased
-                                       * bert-base-multilingual-uncased
-                                       * bert-base-multilingual-cased
-                                       * bert-base-chinese
-    Keyword args:
-    cache_dir: an optional path to a specific directory to download and cache
-               the pre-trained model weights.
-               Default: None
-    do_lower_case: Whether to lower case the input.
-                   Only has an effect when do_wordpiece_only=False
-                   Default: True
-    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-                       Default: True
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-             Default: None
-    never_split: List of tokens which will never be split during tokenization.
-                 Only has an effect when do_wordpiece_only=False
-                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
-
-    Example:
-        import torch
-        sentence = 'Hello, World!'
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        toks = tokenizer.tokenize(sentence)
-        ['Hello', '##,', 'World', '##!']
-        ids = tokenizer.convert_tokens_to_ids(toks)
-        [8667, 28136, 1291, 28125]
-    """
-    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertModel(*args, **kwargs):
-    """
-    BertModel is the basic BERT Transformer model with a layer of summed token,
-    position and sequence embeddings followed by a series of identical
-    self-attention blocks (12 for BERT-base, 24 for BERT-large).
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                encoded_layers, _ = model(tokens_tensor, segments_tensors)
-    """
-    model = BertModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForNextSentencePrediction(*args, **kwargs):
-    """
-    BERT model with next sentence prediction head.
-    This module comprises the BERT model followed by the next sentence
-    classification head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForNextSentencePrediction
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
-        model.eval()
-        # Predict the next sentence classification logits
-        with torch.no_grad():
-                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForPreTraining(*args, **kwargs):
-    """
-    BERT model with pre-training heads.
-    This module comprises the BERT model followed by the two pre-training heads
-        - the masked language modeling head, and
-        - the next sentence classification head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForPreTraining
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
-        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForPreTraining.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMaskedLM(*args, **kwargs):
-    """
-    BertForMaskedLM includes the BertModel Transformer followed by the
-    (possibly) pre-trained masked language modeling head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        masked_index = 8
-        tokenized_text[masked_index] = '[MASK]'
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForMaskedLM
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
-        model.eval()
-        # Predict all tokens
-        with torch.no_grad():
-                predictions = model(tokens_tensor, segments_tensors)
-        predicted_index = torch.argmax(predictions[0, masked_index]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        'henson'
-    """
-    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForSequenceClassification(*args, **kwargs):
-    """
-    BertForSequenceClassification is a fine-tuning model that includes
-    BertModel and a sequence-level (sequence or pair of sequences) classifier
-    on top of the BertModel. Note that the classification head is only initialized
-    and has to be trained.
-
-    The sequence-level classifier is a linear layer that takes as input the
-    last hidden state of the first character in the input sequence
-    (see Figures 3a and 3b in the BERT paper).
-
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForSequenceClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the sequence classification logits
-        with torch.no_grad():
-                seq_classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the sequence classification loss
-        labels = torch.tensor([1])
-        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMultipleChoice(*args, **kwargs):
-    """
-    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
-    linear layer on top of the BertModel. Note that the multiple choice head is
-    only initialized and has to be trained.
-
-    Args:
-    num_choices: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
-        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
-        # Load bertForMultipleChoice
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
-        model.eval()
-        # Predict the multiple choice logits
-        with torch.no_grad():
-                multiple_choice_logits = model(tokens_tensor, segments_tensors)
-        # Or get the multiple choice loss
-        labels = torch.tensor([1])
-        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForQuestionAnswering(*args, **kwargs):
-    """
-    BertForQuestionAnswering is a fine-tuning model that includes BertModel
-    with a token-level classifiers on top of the full sequence of last hidden
-    states. Note that the classification head is only initialized
-    and has to be trained.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForQuestionAnswering
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
-        model.eval()
-        # Predict the start and end positions logits
-        with torch.no_grad():
-                start_logits, end_logits = model(tokens_tensor, segments_tensors)
-        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
-        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
-        # set model.train() before if training this loss
-        multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
-    """
-    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForTokenClassification(*args, **kwargs):
-    """
-    BertForTokenClassification is a fine-tuning model that includes BertModel
-    and a token-level classifier on top of the BertModel. Note that the classification
-    head is only initialized and has to be trained.
-
-    The token-level classifier is a linear layer that takes as input the last
-    hidden state of the sequence.
-
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForTokenClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the token classification logits
-        with torch.no_grad():
-                classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the token classification loss
-        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
-        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
-    return model
diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py
deleted file mode 100644
index 18afad3913..0000000000
--- a/hubconfs/gpt2_hubconf.py
+++ /dev/null
@@ -1,168 +0,0 @@
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_transformers.modeling_gpt2 import (
-    GPT2Model,
-    GPT2LMHeadModel,
-    GPT2DoubleHeadsModel
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt2_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `gpt2`, `gpt2-medium`
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . a TensorFlow checkpoint with trained weights
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific GPT-2 class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def gpt2Tokenizer(*args, **kwargs):
-    """
-    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
-    Peculiarities:
-        - Byte-level BPE
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * gpt2
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2Model(*args, **kwargs):
-    """
-    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
-    identical stacked masked self-attention blocks and pre-trained
-    on large scale dataset using language modeling signal.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load gpt2Model
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # past can be used to reuse precomputed hidden state in a subsequent predictions
-        with torch.no_grad():
-                hidden_states_1, past = model(tokens_tensor_1)
-                hidden_states_2, past = model(tokens_tensor_2, past=past)
-    """
-    model = GPT2Model.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2LMHeadModel(*args, **kwargs):
-    """
-    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load gpt2LMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # past can be used to reuse precomputed hidden state in a subsequent predictions
-        with torch.no_grad():
-                predictions_1, past = model(tokens_tensor_1)
-                predictions_2, past = model(tokens_tensor_2, past=past)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2DoubleHeadsModel(*args, **kwargs):
-    """
-    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head and a multiple choice
-    classification head (only initialized, not pre-trained).
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-        # Load gpt2DoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
-    """
-    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
deleted file mode 100644
index 649075980c..0000000000
--- a/hubconfs/gpt_hubconf.py
+++ /dev/null
@@ -1,186 +0,0 @@
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_transformers.modeling_openai import (
-	OpenAIGPTModel,
-	OpenAIGPTLMHeadModel,
-	OpenAIGPTDoubleHeadsModel
-)
-
-# Dependecies that are not specified in global hubconf.py
-specific_dependencies = ['spacy', 'ftfy']
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt_docstring = """
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-		pretrained_model_name_or_path: either:
-			- a str with the name of a pre-trained model to load selected in the list of:
-				. `openai-gpt`
-			- a path or url to a pretrained model archive containing:
-				. `openai_gpt_config.json` a configuration file for the model
-				. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-			- a path or url to a pretrained model archive containing:
-				. `openai-gpt-config.json` a configuration file for the model
-				. a series of NumPy files containing OpenAI TensorFlow trained weights
-		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-		state_dict: an optional state dictionary (collections.OrderedDict object)
-		        	to use instead of pre-trained models
-		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def openAIGPTTokenizer(*args, **kwargs):
-    """
-    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
-	Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * openai-gpt
-    Keyword args:
-	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-					Default: None
-	max_len: An artificial maximum length to truncate tokenized sequences to;
-        	 Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-			 Default: None
-
-    Example:
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-		
-		text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
-    """
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTModel(*args, **kwargs):
-    """
-    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
-	identical stacked masked self-attention blocks and pre-trained
-	on large scale dataset using language modeling signal.
-
-    Example:
-        # Load the tokenizer
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-
-        # Load openAIGPTModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states = model(tokens_tensor)
-    """
-    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTLMHeadModel(*args, **kwargs):
-    """
-    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head on top.
-
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-
-        # Load openAIGPTLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions = model(tokens_tensor)
-
-		# Get the predicted last token
-		predicted_index = torch.argmax(predictions[0, -1, :]).item()
-		predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        '.</w>'
-    """
-    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTDoubleHeadsModel(*args, **kwargs):
-    """
-    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head and a multiple choice
-	classification head (only initialized, not pre-trained).
-
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-        # Load openAIGPTDoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
-    """
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
deleted file mode 100644
index 548d407581..0000000000
--- a/hubconfs/transformer_xl_hubconf.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_transformers.modeling_transfo_xl import (
-    TransfoXLModel,
-    TransfoXLLMHeadModel
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-transformer_xl_docstring = """
-    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted to decreasing frequency.
-
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `transfo-xl-wt103`
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific TransformerXL class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def transformerXLTokenizer(*args, **kwargs):
-    """
-    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * transfo-xl-wt103
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-        
-        text = "Who was Jim Henson ?"
-        tokenized_text = tokenizer.tokenize(tokenized_text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-    """
-    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load transformerXLModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                hidden_states_1, mems_1 = model(tokens_tensor_1)
-                hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-    """
-    model = TransfoXLModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLLMHeadModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model with the
-    tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load transformerXLLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                predictions_1, mems_1 = model(tokens_tensor_1)
-                predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        assert predicted_token == 'who'
-    """
-    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
deleted file mode 100644
index e96d923944..0000000000
--- a/hubconfs/xlm_hubconf.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from pytorch_transformers.tokenization_xlm import XLMTokenizer
-from pytorch_transformers.modeling_xlm import (
-    XLMConfig,
-    XLMModel,
-    XLMWithLMHeadModel,
-    XLMForSequenceClassification,
-    XLMForQuestionAnswering
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_start_docstring = """
-    Model class adapted from the XLM Transformer model of
-        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-        Paper: https://arxiv.org/abs/1901.07291
-        Original code: https://github.com/facebookresearch/XLM
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-"""
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_end_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlm-mlm-en-2048`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLM class
-"""
-
-
-def _begin_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-def _end_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def xlmTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlm-mlm-en-2048
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmModel(*args, **kwargs):
-    """
-        # Load xlmModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLMModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmLMHeadModel(*args, **kwargs):
-    """
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-# @_end_with_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
-
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-#         model.eval()
-
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py
deleted file mode 100644
index fa7b7ddb9f..0000000000
--- a/hubconfs/xlnet_hubconf.1.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
-from pytorch_transformers.modeling_xlnet import (
-    XLNetConfig,
-    XLNetModel,
-    XLNetLMHeadModel,
-    # XLNetForSequenceClassification
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlnet_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlnet-large-cased`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
-            - a path or url to a pretrained model archive containing:
-                . `xlnet_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLNet class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def xlnetTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
-    Peculiarities:
-        - require Google sentencepiece (https://github.com/google/sentencepiece)
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlnet-large-cased
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLNetModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetLMHeadModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-    with a tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-# @_append_from_pretrained_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-#         model.eval()
-
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 0c328909c2..05ff5e5b33 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -18,11 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-from torch.nn.parameter import Parameter
-
 from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
 from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel
 from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel