clean up and simplify hubconf

2019-08-30 22:34:23 +02:00
parent 80aa87d9a3
commit 256086bc69
9 changed files with 110 additions and 1303 deletions
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,10 +1,112 @@
 from pytorch_transformers import (
    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
 from pytorch_transformers.modeling_utils import add_start_docstrings
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
-from hubconfs.automodels_hubconf import (
+@add_start_docstrings(AutoConfig.__doc__)
-    config,
+def config(*args, **kwargs):
-    model,
+    r""" 
-    modelForQuestionAnswering,
+                # Using torch.hub !
-    modelForSequenceClassification,
+                import torch
-    modelWithLMHead,
+
-    tokenizer,
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-)
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                assert config.output_attention == True
                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                assert config.output_attention == True
                assert unused_kwargs == {'foo': False}
            """
    return AutoConfig.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoTokenizer.__doc__)
 def tokenizer(*args, **kwargs):
    r""" 
        # Using torch.hub !
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
    return AutoTokenizer.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModel.__doc__)
 def model(*args, **kwargs):
    r"""
            # Using torch.hub !
            import torch
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
    return AutoModel.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelWithLMHead.__doc__)
 def modelWithLMHead(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
 def modelForSequenceClassification(*args, **kwargs):
    r"""
            # Using torch.hub !
            import torch
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
 def modelForQuestionAnswering(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/hubconfs/automodels_hubconf.py
+++ b/hubconfs/automodels_hubconf.py
@@ -1,110 +0,0 @@
 from pytorch_transformers import (
    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
 from pytorch_transformers.modeling_utils import add_start_docstrings
@add_start_docstrings(AutoConfig.__doc__)
 def config(*args, **kwargs):
    r""" 
                # Using torch.hub !
                import torch
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                assert config.output_attention == True
                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                assert config.output_attention == True
                assert unused_kwargs == {'foo': False}
            """
    return AutoConfig.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoTokenizer.__doc__)
 def tokenizer(*args, **kwargs):
    r""" 
        # Using torch.hub !
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
    return AutoTokenizer.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModel.__doc__)
 def model(*args, **kwargs):
    r"""
            # Using torch.hub !
            import torch
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
    return AutoModel.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelWithLMHead.__doc__)
 def modelWithLMHead(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
 def modelForSequenceClassification(*args, **kwargs):
    r"""
            # Using torch.hub !
            import torch
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
 def modelForQuestionAnswering(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,360 +0,0 @@
 from pytorch_transformers.tokenization_bert import BertTokenizer
 from pytorch_transformers.modeling_bert import (
        BertModel,
        BertForNextSentencePrediction,
        BertForMaskedLM,
        BertForMultipleChoice,
        BertForPreTraining,
        BertForQuestionAnswering,
        BertForSequenceClassification,
        BertForTokenClassification,
        )
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 bert_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load
                . `bert-base-uncased`
                . `bert-large-uncased`
                . `bert-base-cased`
                . `bert-large-cased`
                . `bert-base-multilingual-uncased`
                . `bert-base-multilingual-cased`
                . `bert-base-chinese`
                . `bert-base-german-cased`
                . `bert-large-uncased-whole-word-masking`
                . `bert-large-cased-whole-word-masking`
            - a path or url to a pretrained model archive containing:
                . `bert_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
                  instance
            - a path or url to a pretrained model archive containing:
                . `bert_config.json` a configuration file for the model
                . `model.chkpt` a TensorFlow checkpoint
        from_tf: should we load the weights from a locally saved TensorFlow
                 checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models
                   will be cached.
        state_dict: an optional state dictionary
                    (collections.OrderedDict object) to use instead of Google
                    pre-trained models
        *inputs, **kwargs: additional input for the specific Bert class
            (ex: num_labels for BertForSequenceClassification)
 """
 def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
    Example:
        import torch
        sentence = 'Hello, World!'
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_append_from_pretrained_docstring(bert_docstring)
 def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForNextSentencePrediction
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
        model.eval()
        # Predict the next sentence classification logits
        with torch.no_grad():
                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        masked_index = 8
        tokenized_text[masked_index] = '[MASK]'
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        model.eval()
        # Predict all tokens
        with torch.no_grad():
                predictions = model(tokens_tensor, segments_tensors)
        predicted_index = torch.argmax(predictions[0, masked_index]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForSequenceClassification(*args, **kwargs):
    """
    BertForSequenceClassification is a fine-tuning model that includes
    BertModel and a sequence-level (sequence or pair of sequences) classifier
    on top of the BertModel. Note that the classification head is only initialized
    and has to be trained.
    The sequence-level classifier is a linear layer that takes as input the
    last hidden state of the first character in the input sequence
    (see Figures 3a and 3b in the BERT paper).
    Args:
    num_labels: the number (>=2) of classes for the classifier.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForSequenceClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the sequence classification logits
        with torch.no_grad():
                seq_classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the sequence classification loss
        labels = torch.tensor([1])
        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel. Note that the multiple choice head is
    only initialized and has to be trained.
    Args:
    num_choices: the number (>=2) of classes for the classifier.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        model.eval()
        # Predict the multiple choice logits
        with torch.no_grad():
                multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        labels = torch.tensor([1])
        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with a token-level classifiers on top of the full sequence of last hidden
    states. Note that the classification head is only initialized
    and has to be trained.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        model.eval()
        # Predict the start and end positions logits
        with torch.no_grad():
                start_logits, end_logits = model(tokens_tensor, segments_tensors)
        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
        # set model.train() before if training this loss
        multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(bert_docstring)
 def bertForTokenClassification(*args, **kwargs):
    """
    BertForTokenClassification is a fine-tuning model that includes BertModel
    and a token-level classifier on top of the BertModel. Note that the classification
    head is only initialized and has to be trained.
    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.
    Args:
    num_labels: the number (>=2) of classes for the classifier.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the token classification logits
        with torch.no_grad():
                classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the token classification loss
        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
    return model
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -1,168 +0,0 @@
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
 from pytorch_transformers.modeling_gpt2 import (
    GPT2Model,
    GPT2LMHeadModel,
    GPT2DoubleHeadsModel
 )
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 gpt2_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `gpt2`, `gpt2-medium`
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
            - a path or url to a pretrained model archive containing:
                . `gpt2_config.json` a configuration file for the model
                . a TensorFlow checkpoint with trained weights
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific GPT-2 class
 """
 def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(tokenized_text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_append_from_pretrained_docstring(gpt2_docstring)
 def gpt2Model(*args, **kwargs):
    """
    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
    identical stacked masked self-attention blocks and pre-trained
    on large scale dataset using language modeling signal.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load gpt2Model
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
        model.eval()
        # Predict hidden states features for each layer
        # past can be used to reuse precomputed hidden state in a subsequent predictions
        with torch.no_grad():
                hidden_states_1, past = model(tokens_tensor_1)
                hidden_states_2, past = model(tokens_tensor_2, past=past)
    """
    model = GPT2Model.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt2_docstring)
 def gpt2LMHeadModel(*args, **kwargs):
    """
    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head on top.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load gpt2LMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        model.eval()
        # Predict hidden states features for each layer
        # past can be used to reuse precomputed hidden state in a subsequent predictions
        with torch.no_grad():
                predictions_1, past = model(tokens_tensor_1)
                predictions_2, past = model(tokens_tensor_2, past=past)
        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt2_docstring)
 def gpt2DoubleHeadsModel(*args, **kwargs):
    """
    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head and a multiple choice
    classification head (only initialized, not pre-trained).
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
        #  Prepare tokenized input
        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        tokenized_text1 = tokenizer.tokenize(text1)
        tokenized_text2 = tokenizer.tokenize(text2)
        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
        # Load gpt2DoubleHeadsModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
    """
    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -1,186 +0,0 @@
 from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
 from pytorch_transformers.modeling_openai import (
 	OpenAIGPTModel,
 	OpenAIGPTLMHeadModel,
 	OpenAIGPTDoubleHeadsModel
 )
 # Dependecies that are not specified in global hubconf.py
 specific_dependencies = ['spacy', 'ftfy']
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 gpt_docstring = """
    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during the fine-tuning if you use them.
    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
    The embeddings are ordered as follow in the token embeddings matrice:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________
    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
        total_tokens_embeddings = config.vocab_size + config.n_special
    You should use the associate indices to index the embeddings.
    Params:
 		pretrained_model_name_or_path: either:
 			- a str with the name of a pre-trained model to load selected in the list of:
 				. `openai-gpt`
 			- a path or url to a pretrained model archive containing:
 				. `openai_gpt_config.json` a configuration file for the model
 				. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
 			- a path or url to a pretrained model archive containing:
 				. `openai-gpt-config.json` a configuration file for the model
 				. a series of NumPy files containing OpenAI TensorFlow trained weights
 		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
 		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
 		state_dict: an optional state dictionary (collections.OrderedDict object)
 		        	to use instead of pre-trained models
 		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
 """
 def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def openAIGPTTokenizer(*args, **kwargs):
    """
    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
 	Peculiarities:
        - lower case all inputs
        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
        - argument special_tokens and function set_special_tokens:
            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * openai-gpt
    Keyword args:
 	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
 					Default: None
 	max_len: An artificial maximum length to truncate tokenized sequences to;
        	 Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
 			 Default: None
    Example:
 		import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
    """
    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_append_from_pretrained_docstring(gpt_docstring)
 def openAIGPTModel(*args, **kwargs):
    """
    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
 	identical stacked masked self-attention blocks and pre-trained
 	on large scale dataset using language modeling signal.
    Example:
        # Load the tokenizer
 		import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
        #  Prepare tokenized input
        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        # Load openAIGPTModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                hidden_states = model(tokens_tensor)
    """
    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt_docstring)
 def openAIGPTLMHeadModel(*args, **kwargs):
    """
    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
 	tied (pre-trained) language modeling head on top.
 	Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
        #  Prepare tokenized input
        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        # Load openAIGPTLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                predictions = model(tokens_tensor)
 		# Get the predicted last token
 		predicted_index = torch.argmax(predictions[0, -1, :]).item()
 		predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        '.</w>'
    """
    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt_docstring)
 def openAIGPTDoubleHeadsModel(*args, **kwargs):
    """
    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
 	tied (pre-trained) language modeling head and a multiple choice
 	classification head (only initialized, not pre-trained).
 	Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
        #  Prepare tokenized input
        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
        tokenized_text1 = tokenizer.tokenize(text1)
        tokenized_text2 = tokenizer.tokenize(text2)
        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
        # Load openAIGPTDoubleHeadsModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
    """
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -1,130 +0,0 @@
 from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
 from pytorch_transformers.modeling_transfo_xl import (
    TransfoXLModel,
    TransfoXLLMHeadModel
 )
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 transformer_xl_docstring = """
    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
    - you don't need to specify positioning embeddings indices
    - the tokens in the vocabulary have to be sorted to decreasing frequency.
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `transfo-xl-wt103`
            - a path or url to a pretrained model archive containing:
                . `transfo_xl_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
            - a path or url to a pretrained model archive containing:
                . `transfo_xl_config.json` a configuration file for the model
                . `model.chkpt` a TensorFlow checkpoint
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific TransformerXL class
 """
 def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def transformerXLTokenizer(*args, **kwargs):
    """
    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * transfo-xl-wt103
    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
        text = "Who was Jim Henson ?"
        tokenized_text = tokenizer.tokenize(tokenized_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    """
    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_append_from_pretrained_docstring(transformer_xl_docstring)
 def transformerXLModel(*args, **kwargs):
    """
    transformerXLModel is the basic Transformer XL model.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        tokenized_text_1 = tokenizer.tokenize(text_1)
        tokenized_text_2 = tokenizer.tokenize(text_2)
        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load transformerXLModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
        model.eval()
        # Predict hidden states features for each layer
        # We can re-use the memory cells in a subsequent call to attend a longer context
        with torch.no_grad():
                hidden_states_1, mems_1 = model(tokens_tensor_1)
                hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
    """
    model = TransfoXLModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(transformer_xl_docstring)
 def transformerXLLMHeadModel(*args, **kwargs):
    """
    transformerXLModel is the basic Transformer XL model with the
    tied (pre-trained) language modeling head on top.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        tokenized_text_1 = tokenizer.tokenize(text_1)
        tokenized_text_2 = tokenizer.tokenize(text_2)
        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load transformerXLLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
        model.eval()
        # Predict hidden states features for each layer
        # We can re-use the memory cells in a subsequent call to attend a longer context
        with torch.no_grad():
                predictions_1, mems_1 = model(tokens_tensor_1)
                predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        assert predicted_token == 'who'
    """
    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
    return model
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -1,167 +0,0 @@
 from pytorch_transformers.tokenization_xlm import XLMTokenizer
 from pytorch_transformers.modeling_xlm import (
    XLMConfig,
    XLMModel,
    XLMWithLMHeadModel,
    XLMForSequenceClassification,
    XLMForQuestionAnswering
 )
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 xlm_start_docstring = """
    Model class adapted from the XLM Transformer model of
        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
        Paper: https://arxiv.org/abs/1901.07291
        Original code: https://github.com/facebookresearch/XLM
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 """
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 xlm_end_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `xlm-mlm-en-2048`
            - a path or url to a pretrained model archive containing:
                . `config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific XLM class
 """
 def _begin_with_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def _end_with_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def xlmTokenizer(*args, **kwargs):
    """
    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlm-mlm-en-2048
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None
    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(tokenized_text)
    """
    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
 def xlmModel(*args, **kwargs):
    """
        # Load xlmModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                hidden_states_1, mems = model(tokens_tensor_1)
                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
    """
    model = XLMModel.from_pretrained(*args, **kwargs)
    return model
@_begin_with_docstring(xlm_start_docstring)
@_end_with_docstring(xlm_end_docstring)
 def xlmLMHeadModel(*args, **kwargs):
    """
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load xlnetLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                predictions_1, mems = model(tokens_tensor_1)
                predictions_2, mems = model(tokens_tensor_2, mems=mems)
        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
    return model
 # @_end_with_docstring(xlnet_docstring)
 # def xlnetForSequenceClassification(*args, **kwargs):
 #     """
 #     xlnetModel is the basic XLNet Transformer model from
 #         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
 #         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
 #     Example:
 #         # Load the tokenizer
 #         import torch
 #         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 #         #  Prepare tokenized input
 #         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
 #         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
 #         tokenized_text1 = tokenizer.tokenize(text1)
 #         tokenized_text2 = tokenizer.tokenize(text2)
 #         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
 #         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
 #         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
 #         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 #         # Load xlnetForSequenceClassification
 #         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
 #         model.eval()
 #         # Predict sequence classes logits
 #         with torch.no_grad():
 #                 lm_logits, mems = model(tokens_tensor)
 #     """
 #     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
 #     return model
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -1,169 +0,0 @@
 from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
 from pytorch_transformers.modeling_xlnet import (
    XLNetConfig,
    XLNetModel,
    XLNetLMHeadModel,
    # XLNetForSequenceClassification
 )
 # A lot of models share the same param doc. Use a decorator
 # to save typing
 xlnet_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load selected in the list of:
                . `xlnet-large-cased`
            - a path or url to a pretrained model archive containing:
                . `config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
            - a path or url to a pretrained model archive containing:
                . `xlnet_config.json` a configuration file for the model
                . `model.chkpt` a TensorFlow checkpoint
        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
        *inputs, **kwargs: additional input for the specific XLNet class
 """
 def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
 def xlnetTokenizer(*args, **kwargs):
    """
    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
    Peculiarities:
        - require Google sentencepiece (https://github.com/google/sentencepiece)
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlnet-large-cased
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None
    Example:
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
        text = "Who was Jim Henson ?"
        indexed_tokens = tokenizer.encode(tokenized_text)
    """
    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
@_append_from_pretrained_docstring(xlnet_docstring)
 def xlnetModel(*args, **kwargs):
    """
    xlnetModel is the basic XLNet Transformer model from
        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load xlnetModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                hidden_states_1, mems = model(tokens_tensor_1)
                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
    """
    model = XLNetModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(xlnet_docstring)
 def xlnetLMHeadModel(*args, **kwargs):
    """
    xlnetModel is the basic XLNet Transformer model from
        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
    with a tied (pre-trained) language modeling head on top.
    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
        #  Prepare tokenized input
        text_1 = "Who was Jim Henson ?"
        text_2 = "Jim Henson was a puppeteer"
        indexed_tokens_1 = tokenizer.encode(text_1)
        indexed_tokens_2 = tokenizer.encode(text_2)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
        # Load xlnetLMHeadModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                predictions_1, mems = model(tokens_tensor_1)
                predictions_2, mems = model(tokens_tensor_2, mems=mems)
        # Get the predicted last token
        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        predicted_token = tokenizer.decode([predicted_index])
        assert predicted_token == ' who'
    """
    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
    return model
 # @_append_from_pretrained_docstring(xlnet_docstring)
 # def xlnetForSequenceClassification(*args, **kwargs):
 #     """
 #     xlnetModel is the basic XLNet Transformer model from
 #         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
 #         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
 #     Example:
 #         # Load the tokenizer
 #         import torch
 #         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 #         #  Prepare tokenized input
 #         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
 #         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
 #         tokenized_text1 = tokenizer.tokenize(text1)
 #         tokenized_text2 = tokenizer.tokenize(text2)
 #         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
 #         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
 #         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
 #         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 #         # Load xlnetForSequenceClassification
 #         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
 #         model.eval()
 #         # Predict sequence classes logits
 #         with torch.no_grad():
 #                 lm_logits, mems = model(tokens_tensor)
 #     """
 #     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
 #     return model
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -18,11 +18,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from torch.nn.parameter import Parameter
 from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
 from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel
 from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel