From 9113b50c9674c1821a337195ae3cd7f0bb9a86c0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:31:51 +0200
Subject: [PATCH] hubs [WIP]

---
 hubconfs/xlm_hubconf.py                       | 167 ++++++++++++++++++
 .../{xlnet_hubconf.py => xlnet_hubconf.1.py}  |   0
 2 files changed, 167 insertions(+)
 create mode 100644 hubconfs/xlm_hubconf.py
 rename hubconfs/{xlnet_hubconf.py => xlnet_hubconf.1.py} (100%)

diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
new file mode 100644
index 0000000000..154f875bfb
--- /dev/null
+++ b/hubconfs/xlm_hubconf.py
@@ -0,0 +1,167 @@
+from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer
+from pytorch_pretrained_bert.modeling_xlm import (
+    XLMConfig,
+    XLMModel,
+    XLMWithLMHeadModel,
+    XLMForSequenceClassification,
+    XLMForQuestionAnswering
+)
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_start_docstring = """
+    Model class adapted from the XLM Transformer model of
+        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+        Paper: https://arxiv.org/abs/1901.07291
+        Original code: https://github.com/facebookresearch/XLM
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+"""
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_end_docstring = """
+    Params:
+        pretrained_model_name_or_path: either:
+            - a str with the name of a pre-trained model to load selected in the list of:
+                . `xlm-mlm-en-2048`
+            - a path or url to a pretrained model archive containing:
+                . `config.json` a configuration file for the model
+                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
+        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+        *inputs, **kwargs: additional input for the specific XLM class
+"""
+
+
+def _begin_with_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+def _end_with_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+
+def xlmTokenizer(*args, **kwargs):
+    """
+    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
+
+    Args:
+    pretrained_model_name_or_path: Path to pretrained model archive
+                                   or one of pre-trained vocab configs below.
+                                       * xlm-mlm-en-2048
+    Keyword args:
+    special_tokens: Special tokens in vocabulary that are not pretrained
+                    Default: None
+    max_len: An artificial maximum length to truncate tokenized sequences to;
+             Effective maximum length is always the minimum of this
+             value (if specified) and the underlying model's
+             sequence length.
+             Default: None
+
+    Example:
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        >>> text = "Who was Jim Henson ?"
+        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+    """
+    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
+    return tokenizer
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmModel(*args, **kwargs):
+    """
+        # Load xlmModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                hidden_states_1, mems = model(tokens_tensor_1)
+                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
+    """
+    model = XLMModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmLMHeadModel(*args, **kwargs):
+    """
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetLMHeadModel
+        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                predictions_1, mems = model(tokens_tensor_1)
+                predictions_2, mems = model(tokens_tensor_2, mems=mems)
+
+        # Get the predicted last token
+        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        >>> predicted_token = tokenizer.decode([predicted_index])
+        >>> assert predicted_token == ' who'
+    """
+    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+# @_end_with_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+
+#         #  Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model.eval()
+
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
diff --git a/hubconfs/xlnet_hubconf.py b/hubconfs/xlnet_hubconf.1.py
similarity index 100%
rename from hubconfs/xlnet_hubconf.py
rename to hubconfs/xlnet_hubconf.1.py